{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 任务目标：利用异烟酸生产过程中的各参数，预测最终异烟酸的收率\n",
    "- 数据集包括生产工程中10个步骤的参数，样本id、A1-A28、B1-B14包括原料、辅料、时间、温度、压强等以及收率\n",
    "- 冠军ATCG解决方案"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "![title](1.png)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import warnings\n",
    "import xgboost as xgb\n",
    "from sklearn.model_selection import KFold\n",
    "from sklearn.metrics import mean_squared_error as mse\n",
    "\n",
    "warnings.simplefilter('ignore')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "df_trn = pd.read_csv(\n",
    "        'data/jinnan_round1_train_20181227.csv', encoding='GB2312')\n",
    "df_tst_a = pd.read_csv(\n",
    "        'data/jinnan_round1_testA_20181227.csv', encoding='GB2312')\n",
    "df_tst_b = pd.read_csv(\n",
    "        'data/jinnan_round1_testB_20190121.csv', encoding='GB2312')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>样本id</th>\n",
       "      <th>A1</th>\n",
       "      <th>A2</th>\n",
       "      <th>A3</th>\n",
       "      <th>A4</th>\n",
       "      <th>A5</th>\n",
       "      <th>A6</th>\n",
       "      <th>A7</th>\n",
       "      <th>A8</th>\n",
       "      <th>A9</th>\n",
       "      <th>...</th>\n",
       "      <th>B6</th>\n",
       "      <th>B7</th>\n",
       "      <th>B8</th>\n",
       "      <th>B9</th>\n",
       "      <th>B10</th>\n",
       "      <th>B11</th>\n",
       "      <th>B12</th>\n",
       "      <th>B13</th>\n",
       "      <th>B14</th>\n",
       "      <th>收率</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>sample_1528</td>\n",
       "      <td>300</td>\n",
       "      <td>NaN</td>\n",
       "      <td>405.0</td>\n",
       "      <td>700</td>\n",
       "      <td>13:30:00</td>\n",
       "      <td>38.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>15:30:00</td>\n",
       "      <td>...</td>\n",
       "      <td>65</td>\n",
       "      <td>11:30:00</td>\n",
       "      <td>45.0</td>\n",
       "      <td>11:30-13:00</td>\n",
       "      <td>14:00-15:30</td>\n",
       "      <td>NaN</td>\n",
       "      <td>800.0</td>\n",
       "      <td>0.15</td>\n",
       "      <td>400</td>\n",
       "      <td>0.879</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>sample_1698</td>\n",
       "      <td>300</td>\n",
       "      <td>NaN</td>\n",
       "      <td>405.0</td>\n",
       "      <td>700</td>\n",
       "      <td>14:00:00</td>\n",
       "      <td>29.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>16:00:00</td>\n",
       "      <td>...</td>\n",
       "      <td>80</td>\n",
       "      <td>6:00:00</td>\n",
       "      <td>45.0</td>\n",
       "      <td>6:00-7:30</td>\n",
       "      <td>7:30-9:00</td>\n",
       "      <td>9:00-10:00</td>\n",
       "      <td>1200.0</td>\n",
       "      <td>0.15</td>\n",
       "      <td>400</td>\n",
       "      <td>0.902</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>sample_639</td>\n",
       "      <td>300</td>\n",
       "      <td>NaN</td>\n",
       "      <td>405.0</td>\n",
       "      <td>700</td>\n",
       "      <td>14:00:00</td>\n",
       "      <td>29.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>16:00:00</td>\n",
       "      <td>...</td>\n",
       "      <td>80</td>\n",
       "      <td>1:00:00</td>\n",
       "      <td>45.0</td>\n",
       "      <td>1:00-2:30</td>\n",
       "      <td>2:30-4:00</td>\n",
       "      <td>4:00-5:00</td>\n",
       "      <td>1200.0</td>\n",
       "      <td>0.15</td>\n",
       "      <td>400</td>\n",
       "      <td>0.936</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>sample_483</td>\n",
       "      <td>300</td>\n",
       "      <td>NaN</td>\n",
       "      <td>405.0</td>\n",
       "      <td>700</td>\n",
       "      <td>1:30:00</td>\n",
       "      <td>38.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>3:00:00</td>\n",
       "      <td>...</td>\n",
       "      <td>65</td>\n",
       "      <td>18:00:00</td>\n",
       "      <td>45.0</td>\n",
       "      <td>19:00-20:30</td>\n",
       "      <td>21:30-23:00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>800.0</td>\n",
       "      <td>0.15</td>\n",
       "      <td>400</td>\n",
       "      <td>0.902</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>sample_617</td>\n",
       "      <td>300</td>\n",
       "      <td>NaN</td>\n",
       "      <td>405.0</td>\n",
       "      <td>700</td>\n",
       "      <td>22:00:00</td>\n",
       "      <td>29.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0:00:00</td>\n",
       "      <td>...</td>\n",
       "      <td>80</td>\n",
       "      <td>9:00:00</td>\n",
       "      <td>45.0</td>\n",
       "      <td>9:00-10:30</td>\n",
       "      <td>10:30-12:00</td>\n",
       "      <td>12:00-13:00</td>\n",
       "      <td>1200.0</td>\n",
       "      <td>0.15</td>\n",
       "      <td>420</td>\n",
       "      <td>0.983</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 44 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "          样本id   A1  A2     A3   A4        A5    A6   A7  A8        A9  ...    \\\n",
       "0  sample_1528  300 NaN  405.0  700  13:30:00  38.0  NaN NaN  15:30:00  ...     \n",
       "1  sample_1698  300 NaN  405.0  700  14:00:00  29.0  NaN NaN  16:00:00  ...     \n",
       "2   sample_639  300 NaN  405.0  700  14:00:00  29.0  NaN NaN  16:00:00  ...     \n",
       "3   sample_483  300 NaN  405.0  700   1:30:00  38.0  NaN NaN   3:00:00  ...     \n",
       "4   sample_617  300 NaN  405.0  700  22:00:00  29.0  NaN NaN   0:00:00  ...     \n",
       "\n",
       "   B6        B7    B8           B9          B10          B11     B12   B13  \\\n",
       "0  65  11:30:00  45.0  11:30-13:00  14:00-15:30          NaN   800.0  0.15   \n",
       "1  80   6:00:00  45.0    6:00-7:30    7:30-9:00   9:00-10:00  1200.0  0.15   \n",
       "2  80   1:00:00  45.0    1:00-2:30    2:30-4:00    4:00-5:00  1200.0  0.15   \n",
       "3  65  18:00:00  45.0  19:00-20:30  21:30-23:00          NaN   800.0  0.15   \n",
       "4  80   9:00:00  45.0   9:00-10:30  10:30-12:00  12:00-13:00  1200.0  0.15   \n",
       "\n",
       "   B14     收率  \n",
       "0  400  0.879  \n",
       "1  400  0.902  \n",
       "2  400  0.936  \n",
       "3  400  0.902  \n",
       "4  420  0.983  \n",
       "\n",
       "[5 rows x 44 columns]"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_trn.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 1396 entries, 0 to 1395\n",
      "Data columns (total 44 columns):\n",
      "样本id    1396 non-null object\n",
      "A1      1396 non-null int64\n",
      "A2      42 non-null float64\n",
      "A3      1354 non-null float64\n",
      "A4      1396 non-null int64\n",
      "A5      1396 non-null object\n",
      "A6      1396 non-null float64\n",
      "A7      149 non-null object\n",
      "A8      149 non-null float64\n",
      "A9      1396 non-null object\n",
      "A10     1396 non-null int64\n",
      "A11     1396 non-null object\n",
      "A12     1396 non-null int64\n",
      "A13     1396 non-null float64\n",
      "A14     1396 non-null object\n",
      "A15     1396 non-null float64\n",
      "A16     1396 non-null object\n",
      "A17     1396 non-null float64\n",
      "A18     1396 non-null float64\n",
      "A19     1396 non-null int64\n",
      "A20     1396 non-null object\n",
      "A21     1393 non-null float64\n",
      "A22     1396 non-null float64\n",
      "A23     1393 non-null float64\n",
      "A24     1395 non-null object\n",
      "A25     1396 non-null object\n",
      "A26     1394 non-null object\n",
      "A27     1396 non-null int64\n",
      "A28     1396 non-null object\n",
      "B1      1386 non-null float64\n",
      "B2      1394 non-null float64\n",
      "B3      1394 non-null float64\n",
      "B4      1396 non-null object\n",
      "B5      1395 non-null object\n",
      "B6      1396 non-null int64\n",
      "B7      1396 non-null object\n",
      "B8      1395 non-null float64\n",
      "B9      1396 non-null object\n",
      "B10     1152 non-null object\n",
      "B11     547 non-null object\n",
      "B12     1395 non-null float64\n",
      "B13     1395 non-null float64\n",
      "B14     1396 non-null int64\n",
      "收率      1396 non-null float64\n",
      "dtypes: float64(18), int64(8), object(18)\n",
      "memory usage: 480.0+ KB\n"
     ]
    }
   ],
   "source": [
    "df_trn.info()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 数据检查与问题修正"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def train_abnormal_revise(data):\n",
    "    df_trn = data.copy() \n",
    "    df_trn.loc[(df_trn['A1'] == 200) & (df_trn['A3'] == 405), 'A1'] = 300\n",
    "    df_trn['A5'] = df_trn['A5'].replace('1900/1/21 0:00', '21:00:00')\n",
    "    df_trn['A5'] = df_trn['A5'].replace('1900/1/29 0:00', '14:00:00')\n",
    "    df_trn['A9'] = df_trn['A9'].replace('1900/1/9 7:00', '23:00:00')\n",
    "    df_trn['A9'] = df_trn['A9'].replace('700', '7:00:00')\n",
    "    df_trn['A11'] = df_trn['A11'].replace(':30:00', '00:30:00')\n",
    "    df_trn['A11'] = df_trn['A11'].replace('1900/1/1 2:30', '21:30:00')\n",
    "    df_trn['A16'] = df_trn['A16'].replace('1900/1/12 0:00', '12:00:00')\n",
    "    df_trn['A20'] = df_trn['A20'].replace('6:00-6:30分', '6:00-6:30')\n",
    "    df_trn['A20'] = df_trn['A20'].replace('18:30-15:00', '18:30-19:00')\n",
    "    df_trn['A22'] = df_trn['A22'].replace(3.5, np.nan)\n",
    "    df_trn['A25'] = df_trn['A25'].replace('1900/3/10 0:00', 70).astype(int)\n",
    "    df_trn['A26'] = df_trn['A26'].replace('1900/3/13 0:00', '13:00:00')\n",
    "    df_trn['B1'] = df_trn['B1'].replace(3.5, np.nan)\n",
    "    df_trn['B4'] = df_trn['B4'].replace('15:00-1600', '15:00-16:00')\n",
    "    df_trn['B4'] = df_trn['B4'].replace('18:00-17:00', '16:00-17:00')\n",
    "    df_trn['B4'] = df_trn['B4'].replace('19:-20:05', '19:05-20:05')\n",
    "    df_trn['B9'] = df_trn['B9'].replace('23:00-7:30', '23:00-00:30')\n",
    "    df_trn['B14'] = df_trn['B14'].replace(40, 400)\n",
    "    return df_trn\n",
    "\n",
    "\n",
    "def test_a_abnormal_revise(data):\n",
    "    df_tst = data.copy()\n",
    "    df_tst['A5'] = df_tst['A5'].replace('1900/1/22 0:00', '22:00:00')\n",
    "    df_tst['A7'] = df_tst['A7'].replace('0:50:00', '21:50:00')\n",
    "    df_tst['B14'] = df_tst['B14'].replace(785, 385)\n",
    "    return df_tst\n",
    "\n",
    "\n",
    "def train_abnormal_adjust(data):\n",
    "    df_trn = data.copy()\n",
    "    df_trn.loc[df_trn['样本id'] == 'sample_1894', 'A5'] = '14:00:00'\n",
    "    df_trn.loc[df_trn['样本id'] == 'sample_1234', 'A9'] = '0:00:00'\n",
    "    df_trn.loc[df_trn['样本id'] == 'sample_1020', 'A9'] = '18:30:00'\n",
    "\n",
    "    df_trn.loc[df_trn['样本id'] == 'sample_1380', 'A11'] = '15:30:00'\n",
    "    df_trn.loc[df_trn['样本id'] == 'sample_844', 'A11'] = '10:00:00'\n",
    "    df_trn.loc[df_trn['样本id'] == 'sample_1348', 'A11'] = '17:00:00'\n",
    "    df_trn.loc[df_trn['样本id'] == 'sample_25', 'A11'] = '00:30:00'\n",
    "    df_trn.loc[df_trn['样本id'] == 'sample_1105', 'A11'] = '4:00:00'\n",
    "    df_trn.loc[df_trn['样本id'] == 'sample_313', 'A11'] = '15:30:00'\n",
    "\n",
    "    df_trn.loc[df_trn['样本id'] == 'sample_291', 'A14'] = '19:30:00'\n",
    "\n",
    "    df_trn.loc[df_trn['样本id'] == 'sample_1398', 'A16'] = '11:00:00'\n",
    "    df_trn.loc[df_trn['样本id'] == 'sample_1177', 'A20'] = '19:00-20:00'\n",
    "\n",
    "    df_trn.loc[df_trn['样本id'] == 'sample_71', 'A20'] = '16:20-16:50'\n",
    "    df_trn.loc[df_trn['样本id'] == 'sample_14', 'A20'] = '18:00-18:30'\n",
    "    df_trn.loc[df_trn['样本id'] == 'sample_69', 'A20'] = '6:10-6:50'\n",
    "    df_trn.loc[df_trn['样本id'] == 'sample_1500', 'A20'] = '23:00-23:30'\n",
    "\n",
    "    df_trn.loc[df_trn['样本id'] == 'sample_1524', 'A24'] = '15:00:00'\n",
    "    df_trn.loc[df_trn['样本id'] == 'sample_1524', 'A26'] = '15:30:00'\n",
    "\n",
    "    df_trn.loc[df_trn['样本id'] == 'sample_1046', 'A28'] = '18:00-18:30'\n",
    "\n",
    "    df_trn.loc[df_trn['样本id'] == 'sample_1230', 'B5'] = '17:00:00'\n",
    "    df_trn.loc[df_trn['样本id'] == 'sample_97', 'B7'] = '1:00:00'\n",
    "    df_trn.loc[df_trn['样本id'] == 'sample_752', 'B9'] = '11:00-14:00'\n",
    "\n",
    "    df_trn.loc[df_trn['样本id'] == 'sample_609', 'B11'] = '11:00-12:00'\n",
    "    df_trn.loc[df_trn['样本id'] == 'sample_643', 'B11'] = '12:00-13:00'\n",
    "    df_trn.loc[df_trn['样本id'] == 'sample_1164', 'B11'] = '5:00-6:00'\n",
    "    return df_trn\n",
    "\n",
    "\n",
    "def test_a_abnormal_adjust(data):\n",
    "    df_tst = data.copy()\n",
    "    df_tst.loc[df_tst['样本id'] == 'sample_919', 'A9'] = '19:50:00'\n",
    "    return df_tst\n",
    "\n",
    "\n",
    "def test_b_abnormal_adjust(data):\n",
    "    df_tst = data.copy()\n",
    "    df_tst.loc[df_tst['样本id'] == 'sample_566', 'A5'] = '18:00:00'\n",
    "    df_tst.loc[df_tst['样本id'] == 'sample_40', 'A20'] = '5:00-5:30'\n",
    "    df_tst.loc[df_tst['样本id'] == 'sample_531', 'B5'] = '1:00'\n",
    "    return df_tst"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "df_trn = train_abnormal_revise(df_trn).pipe(train_abnormal_adjust)\n",
    "df_tst_a = test_a_abnormal_revise(df_tst_a).pipe(test_a_abnormal_adjust)\n",
    "df_tst_b = test_b_abnormal_adjust(df_tst_b)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "#### 标签与数据集整合"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "df_trn, df_tst = df_trn.copy(), df_tst_a.copy()\n",
    "df_target = df_trn['收率']\n",
    "del df_trn['收率']\n",
    "df_trn_tst = df_trn.append(df_tst, ignore_index=False).reset_index(\n",
    "    drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "for _df in [df_trn, df_tst, df_trn_tst]:\n",
    "    _df['A3'] = _df['A3'].fillna(405)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 时间段特征处理 "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# 所有时间相关列\n",
    "cols_timer = ['A5', 'A7', 'A9', 'A11', 'A14', 'A16', 'A24', 'A26', 'B5', 'B7']\n",
    "# 同时对训练和测试集进行相同处理\n",
    "for _df in [df_trn_tst, df_trn, df_tst]:\n",
    "    # 添加列名标记\n",
    "    _df.rename(columns={_col: _col + '_t' for _col in cols_timer},\n",
    "               inplace=True)\n",
    "    # 遍历所有持续时间相关列例如21:00-21:30\n",
    "    for _col in ['A20', 'A28', 'B4', 'B9', 'B10', 'B11']:\n",
    "        # 取到当前列的索引\n",
    "        _idx_col = _df.columns.tolist().index(_col)\n",
    "        # 添加新的一列，表示起始时间，split表示分别取开始和结束时间，用索引来指定\n",
    "        _df.insert(_idx_col + 1, _col + '_at',\n",
    "                   _df[_col].str.split('-').str[0])\n",
    "        # 添加新的一列，表示终止时间\n",
    "        _df.insert(_idx_col + 2, _col + '_bt',\n",
    "                   _df[_col].str.split('-').str[1])\n",
    "        # 删除持续时间\n",
    "        del _df[_col]\n",
    "        cols_timer = cols_timer + [_col + '_at', _col + '_bt']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>样本id</th>\n",
       "      <th>A1</th>\n",
       "      <th>A2</th>\n",
       "      <th>A3</th>\n",
       "      <th>A4</th>\n",
       "      <th>A5_t</th>\n",
       "      <th>A6</th>\n",
       "      <th>A7_t</th>\n",
       "      <th>A8</th>\n",
       "      <th>A9_t</th>\n",
       "      <th>...</th>\n",
       "      <th>B8</th>\n",
       "      <th>B9_at</th>\n",
       "      <th>B9_bt</th>\n",
       "      <th>B10_at</th>\n",
       "      <th>B10_bt</th>\n",
       "      <th>B11_at</th>\n",
       "      <th>B11_bt</th>\n",
       "      <th>B12</th>\n",
       "      <th>B13</th>\n",
       "      <th>B14</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>sample_1528</td>\n",
       "      <td>300</td>\n",
       "      <td>NaN</td>\n",
       "      <td>405.0</td>\n",
       "      <td>700</td>\n",
       "      <td>13:30:00</td>\n",
       "      <td>38.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>15:30:00</td>\n",
       "      <td>...</td>\n",
       "      <td>45.0</td>\n",
       "      <td>11:30</td>\n",
       "      <td>13:00</td>\n",
       "      <td>14:00</td>\n",
       "      <td>15:30</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>800.0</td>\n",
       "      <td>0.15</td>\n",
       "      <td>400</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>sample_1698</td>\n",
       "      <td>300</td>\n",
       "      <td>NaN</td>\n",
       "      <td>405.0</td>\n",
       "      <td>700</td>\n",
       "      <td>14:00:00</td>\n",
       "      <td>29.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>16:00:00</td>\n",
       "      <td>...</td>\n",
       "      <td>45.0</td>\n",
       "      <td>6:00</td>\n",
       "      <td>7:30</td>\n",
       "      <td>7:30</td>\n",
       "      <td>9:00</td>\n",
       "      <td>9:00</td>\n",
       "      <td>10:00</td>\n",
       "      <td>1200.0</td>\n",
       "      <td>0.15</td>\n",
       "      <td>400</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>sample_639</td>\n",
       "      <td>300</td>\n",
       "      <td>NaN</td>\n",
       "      <td>405.0</td>\n",
       "      <td>700</td>\n",
       "      <td>14:00:00</td>\n",
       "      <td>29.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>16:00:00</td>\n",
       "      <td>...</td>\n",
       "      <td>45.0</td>\n",
       "      <td>1:00</td>\n",
       "      <td>2:30</td>\n",
       "      <td>2:30</td>\n",
       "      <td>4:00</td>\n",
       "      <td>4:00</td>\n",
       "      <td>5:00</td>\n",
       "      <td>1200.0</td>\n",
       "      <td>0.15</td>\n",
       "      <td>400</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>sample_483</td>\n",
       "      <td>300</td>\n",
       "      <td>NaN</td>\n",
       "      <td>405.0</td>\n",
       "      <td>700</td>\n",
       "      <td>1:30:00</td>\n",
       "      <td>38.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>3:00:00</td>\n",
       "      <td>...</td>\n",
       "      <td>45.0</td>\n",
       "      <td>19:00</td>\n",
       "      <td>20:30</td>\n",
       "      <td>21:30</td>\n",
       "      <td>23:00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>800.0</td>\n",
       "      <td>0.15</td>\n",
       "      <td>400</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>sample_617</td>\n",
       "      <td>300</td>\n",
       "      <td>NaN</td>\n",
       "      <td>405.0</td>\n",
       "      <td>700</td>\n",
       "      <td>22:00:00</td>\n",
       "      <td>29.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0:00:00</td>\n",
       "      <td>...</td>\n",
       "      <td>45.0</td>\n",
       "      <td>9:00</td>\n",
       "      <td>10:30</td>\n",
       "      <td>10:30</td>\n",
       "      <td>12:00</td>\n",
       "      <td>12:00</td>\n",
       "      <td>13:00</td>\n",
       "      <td>1200.0</td>\n",
       "      <td>0.15</td>\n",
       "      <td>420</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 49 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "          样本id   A1  A2     A3   A4      A5_t    A6 A7_t  A8      A9_t ...   \\\n",
       "0  sample_1528  300 NaN  405.0  700  13:30:00  38.0  NaN NaN  15:30:00 ...    \n",
       "1  sample_1698  300 NaN  405.0  700  14:00:00  29.0  NaN NaN  16:00:00 ...    \n",
       "2   sample_639  300 NaN  405.0  700  14:00:00  29.0  NaN NaN  16:00:00 ...    \n",
       "3   sample_483  300 NaN  405.0  700   1:30:00  38.0  NaN NaN   3:00:00 ...    \n",
       "4   sample_617  300 NaN  405.0  700  22:00:00  29.0  NaN NaN   0:00:00 ...    \n",
       "\n",
       "     B8  B9_at  B9_bt  B10_at B10_bt  B11_at B11_bt     B12   B13  B14  \n",
       "0  45.0  11:30  13:00   14:00  15:30     NaN    NaN   800.0  0.15  400  \n",
       "1  45.0   6:00   7:30    7:30   9:00    9:00  10:00  1200.0  0.15  400  \n",
       "2  45.0   1:00   2:30    2:30   4:00    4:00   5:00  1200.0  0.15  400  \n",
       "3  45.0  19:00  20:30   21:30  23:00     NaN    NaN   800.0  0.15  400  \n",
       "4  45.0   9:00  10:30   10:30  12:00   12:00  13:00  1200.0  0.15  420  \n",
       "\n",
       "[5 rows x 49 columns]"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_trn_tst.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "cols_timer = list(filter(lambda x: x.endswith('t'), df_trn_tst.columns))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['A5_t',\n",
       " 'A7_t',\n",
       " 'A9_t',\n",
       " 'A11_t',\n",
       " 'A14_t',\n",
       " 'A16_t',\n",
       " 'A20_at',\n",
       " 'A20_bt',\n",
       " 'A24_t',\n",
       " 'A26_t',\n",
       " 'A28_at',\n",
       " 'A28_bt',\n",
       " 'B4_at',\n",
       " 'B4_bt',\n",
       " 'B5_t',\n",
       " 'B7_t',\n",
       " 'B9_at',\n",
       " 'B9_bt',\n",
       " 'B10_at',\n",
       " 'B10_bt',\n",
       " 'B11_at',\n",
       " 'B11_bt']"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cols_timer"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "将时间全部转换成分钟形式"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def time_to_min(x):\n",
    "    if x is np.nan:\n",
    "        return np.nan\n",
    "    else:\n",
    "        x = x.replace(';', ':').replace('；', ':')\n",
    "        x = x.replace('::', ':').replace('\"', ':')\n",
    "        h, m = x.split(':')[:2]\n",
    "        h = 0 if not h else h\n",
    "        m = 0 if not m else m\n",
    "        return int(h)*60 + int(m)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "for _df in [df_trn_tst, df_trn, df_tst]:\n",
    "    for _col in cols_timer:\n",
    "        _df[_col] = _df[_col].map(time_to_min)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>样本id</th>\n",
       "      <th>A1</th>\n",
       "      <th>A2</th>\n",
       "      <th>A3</th>\n",
       "      <th>A4</th>\n",
       "      <th>A5_t</th>\n",
       "      <th>A6</th>\n",
       "      <th>A7_t</th>\n",
       "      <th>A8</th>\n",
       "      <th>A9_t</th>\n",
       "      <th>...</th>\n",
       "      <th>B8</th>\n",
       "      <th>B9_at</th>\n",
       "      <th>B9_bt</th>\n",
       "      <th>B10_at</th>\n",
       "      <th>B10_bt</th>\n",
       "      <th>B11_at</th>\n",
       "      <th>B11_bt</th>\n",
       "      <th>B12</th>\n",
       "      <th>B13</th>\n",
       "      <th>B14</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>sample_1528</td>\n",
       "      <td>300</td>\n",
       "      <td>NaN</td>\n",
       "      <td>405.0</td>\n",
       "      <td>700</td>\n",
       "      <td>810</td>\n",
       "      <td>38.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>930</td>\n",
       "      <td>...</td>\n",
       "      <td>45.0</td>\n",
       "      <td>690</td>\n",
       "      <td>780</td>\n",
       "      <td>840.0</td>\n",
       "      <td>930.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>800.0</td>\n",
       "      <td>0.15</td>\n",
       "      <td>400</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>sample_1698</td>\n",
       "      <td>300</td>\n",
       "      <td>NaN</td>\n",
       "      <td>405.0</td>\n",
       "      <td>700</td>\n",
       "      <td>840</td>\n",
       "      <td>29.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>960</td>\n",
       "      <td>...</td>\n",
       "      <td>45.0</td>\n",
       "      <td>360</td>\n",
       "      <td>450</td>\n",
       "      <td>450.0</td>\n",
       "      <td>540.0</td>\n",
       "      <td>540.0</td>\n",
       "      <td>600.0</td>\n",
       "      <td>1200.0</td>\n",
       "      <td>0.15</td>\n",
       "      <td>400</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>sample_639</td>\n",
       "      <td>300</td>\n",
       "      <td>NaN</td>\n",
       "      <td>405.0</td>\n",
       "      <td>700</td>\n",
       "      <td>840</td>\n",
       "      <td>29.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>960</td>\n",
       "      <td>...</td>\n",
       "      <td>45.0</td>\n",
       "      <td>60</td>\n",
       "      <td>150</td>\n",
       "      <td>150.0</td>\n",
       "      <td>240.0</td>\n",
       "      <td>240.0</td>\n",
       "      <td>300.0</td>\n",
       "      <td>1200.0</td>\n",
       "      <td>0.15</td>\n",
       "      <td>400</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>sample_483</td>\n",
       "      <td>300</td>\n",
       "      <td>NaN</td>\n",
       "      <td>405.0</td>\n",
       "      <td>700</td>\n",
       "      <td>90</td>\n",
       "      <td>38.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>180</td>\n",
       "      <td>...</td>\n",
       "      <td>45.0</td>\n",
       "      <td>1140</td>\n",
       "      <td>1230</td>\n",
       "      <td>1290.0</td>\n",
       "      <td>1380.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>800.0</td>\n",
       "      <td>0.15</td>\n",
       "      <td>400</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>sample_617</td>\n",
       "      <td>300</td>\n",
       "      <td>NaN</td>\n",
       "      <td>405.0</td>\n",
       "      <td>700</td>\n",
       "      <td>1320</td>\n",
       "      <td>29.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>45.0</td>\n",
       "      <td>540</td>\n",
       "      <td>630</td>\n",
       "      <td>630.0</td>\n",
       "      <td>720.0</td>\n",
       "      <td>720.0</td>\n",
       "      <td>780.0</td>\n",
       "      <td>1200.0</td>\n",
       "      <td>0.15</td>\n",
       "      <td>420</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 49 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "          样本id   A1  A2     A3   A4  A5_t    A6  A7_t  A8  A9_t ...     B8  \\\n",
       "0  sample_1528  300 NaN  405.0  700   810  38.0   NaN NaN   930 ...   45.0   \n",
       "1  sample_1698  300 NaN  405.0  700   840  29.0   NaN NaN   960 ...   45.0   \n",
       "2   sample_639  300 NaN  405.0  700   840  29.0   NaN NaN   960 ...   45.0   \n",
       "3   sample_483  300 NaN  405.0  700    90  38.0   NaN NaN   180 ...   45.0   \n",
       "4   sample_617  300 NaN  405.0  700  1320  29.0   NaN NaN     0 ...   45.0   \n",
       "\n",
       "   B9_at  B9_bt  B10_at  B10_bt  B11_at  B11_bt     B12   B13  B14  \n",
       "0    690    780   840.0   930.0     NaN     NaN   800.0  0.15  400  \n",
       "1    360    450   450.0   540.0   540.0   600.0  1200.0  0.15  400  \n",
       "2     60    150   150.0   240.0   240.0   300.0  1200.0  0.15  400  \n",
       "3   1140   1230  1290.0  1380.0     NaN     NaN   800.0  0.15  400  \n",
       "4    540    630   630.0   720.0   720.0   780.0  1200.0  0.15  420  \n",
       "\n",
       "[5 rows x 49 columns]"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_trn_tst.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 创建一个df来准备添加嗷嗷多特征"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>样本id</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>sample_1528</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>sample_1698</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>sample_639</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>sample_483</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>sample_617</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          样本id\n",
       "0  sample_1528\n",
       "1  sample_1698\n",
       "2   sample_639\n",
       "3   sample_483\n",
       "4   sample_617"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "raw = df_trn_tst.copy()\n",
    "df = pd.DataFrame(raw['样本id'])\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "#### 温度相关特征"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# 加热过程\n",
    "df['P1_S1_A6_0C'] = raw['A6']  # 容器初始温度\n",
    "df['P1_S2_A8_1C'] = raw['A8']  # 首次测温温度\n",
    "df['P1_S3_A10_2C'] = raw['A10']  # 准备水解温度\n",
    "df['P1_C1_C0_D'] = raw['A8'] - raw['A6']  # 测温温差\n",
    "df['P1_C2_C0_D'] = raw['A10'] - raw['A6']  # 初次沸腾温差\n",
    "\n",
    "# 水解过程\n",
    "df['P2_S1_A12_3C'] = raw['A12']  # 水解开始温度\n",
    "df['P2_S2_A15_4C'] = raw['A15']  # 水解过程测温温度\n",
    "df['P2_S3_A17_5C'] = raw['A17']  # 水解结束温度\n",
    "df['P2_C3_C0_D'] = raw['A12'] - raw['A6']  # 水解开始与初始温度温差\n",
    "df['P2_C3_C2_D'] = raw['A12'] - raw['A10']  # 水解开始前恒温温差\n",
    "df['P2_C4_C3_D'] = raw['A15'] - raw['A12']  # 水解过程中途温差\n",
    "df['P2_C5_C4_D'] = raw['A17'] - raw['A15']  # 水解结束中途温差\n",
    "df['P2_C5_C3_KD'] = raw['A17'] - raw['A12']  # 水解起止温差\n",
    "\n",
    "# 脱色过程\n",
    "df['P3_S2_A25_7C'] = raw['A25']  # 脱色保温开始温度\n",
    "df['P3_S3_A27_8C'] = raw['A27']  # 脱色保温结束温度\n",
    "df['P3_C7_C5_D'] = raw['A25'] - raw['A17']  # 降温温差\n",
    "df['P3_C8_C7_KD'] = raw['A27'] - raw['A25']  # 保温温差\n",
    "\n",
    "# 结晶过程\n",
    "df['P4_S2_B6_11C'] = raw['B6']  # 结晶开始温度\n",
    "df['P4_S3_B8_12C'] = raw['B8']  # 结晶结束温度\n",
    "df['P4_C11_C8_D'] = raw['B6'] - raw['A27']  # 脱色结束到结晶温差\n",
    "df['P4_C12_C11_KD'] = raw['B8'] - raw['B6']  # 结晶温差"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 温度相关统计特征"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "_funcs = ['mean', 'std', 'sum']\n",
    "# 遍历每一种统计指标\n",
    "for _func in _funcs:\n",
    "    # 对每一个样本计算各项指标\n",
    "    df[f'P2_C2-C5_{_func}'] = raw[['A10', 'A12', 'A15', 'A17']].\\\n",
    "        agg(_func, axis=1)  # 沸腾过程温度\n",
    "    df[f'P2_D3-D5_{_func}'] = \\\n",
    "        df[[f'P2_C{i}_C{i-1}_D' for i in range(3, 6)]].\\\n",
    "            abs().agg(_func, axis=1)  # 沸腾过程绝对温差\n",
    "    df[f'P2_C1-C12_KD_ABS_{_func}'] = \\\n",
    "        df[[_f for _f in df.columns if _f.endswith('KD')]].\\\n",
    "            abs().agg(_func, axis=1)  # 关键过程绝对温差\n",
    "    df[f'P2_C1-C12_D_{_func}'] = \\\n",
    "        df[[_f for _f in df.columns if _f.endswith('D')]].\\\n",
    "            abs().agg(_func, axis=1)  # 所有过程绝对温差\n",
    "    df[f'P2_LARGE_KD_{_func}'] = \\\n",
    "        df[['P2_C3_C0_D', 'P3_C7_C5_D', 'P4_C12_C11_KD']].\\\n",
    "            abs().agg(_func, axis=1)  # 大温差绝对温差"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>样本id</th>\n",
       "      <th>P1_S1_A6_0C</th>\n",
       "      <th>P1_S2_A8_1C</th>\n",
       "      <th>P1_S3_A10_2C</th>\n",
       "      <th>P1_C1_C0_D</th>\n",
       "      <th>P1_C2_C0_D</th>\n",
       "      <th>P2_S1_A12_3C</th>\n",
       "      <th>P2_S2_A15_4C</th>\n",
       "      <th>P2_S3_A17_5C</th>\n",
       "      <th>P2_C3_C0_D</th>\n",
       "      <th>...</th>\n",
       "      <th>P2_C2-C5_std</th>\n",
       "      <th>P2_D3-D5_std</th>\n",
       "      <th>P2_C1-C12_KD_ABS_std</th>\n",
       "      <th>P2_C1-C12_D_std</th>\n",
       "      <th>P2_LARGE_KD_std</th>\n",
       "      <th>P2_C2-C5_sum</th>\n",
       "      <th>P2_D3-D5_sum</th>\n",
       "      <th>P2_C1-C12_KD_ABS_sum</th>\n",
       "      <th>P2_C1-C12_D_sum</th>\n",
       "      <th>P2_LARGE_KD_sum</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>sample_1528</td>\n",
       "      <td>38.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>100</td>\n",
       "      <td>NaN</td>\n",
       "      <td>62.0</td>\n",
       "      <td>102.0</td>\n",
       "      <td>103.0</td>\n",
       "      <td>104.0</td>\n",
       "      <td>64.0</td>\n",
       "      <td>...</td>\n",
       "      <td>1.707825</td>\n",
       "      <td>0.57735</td>\n",
       "      <td>9.643651</td>\n",
       "      <td>24.928565</td>\n",
       "      <td>23.245071</td>\n",
       "      <td>409.0</td>\n",
       "      <td>4.0</td>\n",
       "      <td>27.0</td>\n",
       "      <td>191.0</td>\n",
       "      <td>113.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>sample_1698</td>\n",
       "      <td>29.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>101</td>\n",
       "      <td>NaN</td>\n",
       "      <td>72.0</td>\n",
       "      <td>103.0</td>\n",
       "      <td>104.0</td>\n",
       "      <td>105.0</td>\n",
       "      <td>74.0</td>\n",
       "      <td>...</td>\n",
       "      <td>1.707825</td>\n",
       "      <td>0.57735</td>\n",
       "      <td>17.785762</td>\n",
       "      <td>28.887521</td>\n",
       "      <td>25.890796</td>\n",
       "      <td>413.0</td>\n",
       "      <td>4.0</td>\n",
       "      <td>44.0</td>\n",
       "      <td>226.0</td>\n",
       "      <td>134.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>sample_639</td>\n",
       "      <td>29.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>102</td>\n",
       "      <td>NaN</td>\n",
       "      <td>73.0</td>\n",
       "      <td>103.0</td>\n",
       "      <td>104.0</td>\n",
       "      <td>105.0</td>\n",
       "      <td>74.0</td>\n",
       "      <td>...</td>\n",
       "      <td>1.290994</td>\n",
       "      <td>0.00000</td>\n",
       "      <td>18.009257</td>\n",
       "      <td>29.231642</td>\n",
       "      <td>25.514702</td>\n",
       "      <td>414.0</td>\n",
       "      <td>3.0</td>\n",
       "      <td>43.0</td>\n",
       "      <td>226.0</td>\n",
       "      <td>135.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>sample_483</td>\n",
       "      <td>38.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>100</td>\n",
       "      <td>NaN</td>\n",
       "      <td>62.0</td>\n",
       "      <td>102.0</td>\n",
       "      <td>103.0</td>\n",
       "      <td>104.0</td>\n",
       "      <td>64.0</td>\n",
       "      <td>...</td>\n",
       "      <td>1.707825</td>\n",
       "      <td>0.57735</td>\n",
       "      <td>9.165151</td>\n",
       "      <td>24.617293</td>\n",
       "      <td>22.479620</td>\n",
       "      <td>409.0</td>\n",
       "      <td>4.0</td>\n",
       "      <td>30.0</td>\n",
       "      <td>207.0</td>\n",
       "      <td>118.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>sample_617</td>\n",
       "      <td>29.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>101</td>\n",
       "      <td>NaN</td>\n",
       "      <td>72.0</td>\n",
       "      <td>103.0</td>\n",
       "      <td>104.0</td>\n",
       "      <td>105.0</td>\n",
       "      <td>74.0</td>\n",
       "      <td>...</td>\n",
       "      <td>1.707825</td>\n",
       "      <td>0.57735</td>\n",
       "      <td>17.785762</td>\n",
       "      <td>28.887521</td>\n",
       "      <td>25.890796</td>\n",
       "      <td>413.0</td>\n",
       "      <td>4.0</td>\n",
       "      <td>44.0</td>\n",
       "      <td>226.0</td>\n",
       "      <td>134.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 37 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "          样本id  P1_S1_A6_0C  P1_S2_A8_1C  P1_S3_A10_2C  P1_C1_C0_D  \\\n",
       "0  sample_1528         38.0          NaN           100         NaN   \n",
       "1  sample_1698         29.0          NaN           101         NaN   \n",
       "2   sample_639         29.0          NaN           102         NaN   \n",
       "3   sample_483         38.0          NaN           100         NaN   \n",
       "4   sample_617         29.0          NaN           101         NaN   \n",
       "\n",
       "   P1_C2_C0_D  P2_S1_A12_3C  P2_S2_A15_4C  P2_S3_A17_5C  P2_C3_C0_D  \\\n",
       "0        62.0         102.0         103.0         104.0        64.0   \n",
       "1        72.0         103.0         104.0         105.0        74.0   \n",
       "2        73.0         103.0         104.0         105.0        74.0   \n",
       "3        62.0         102.0         103.0         104.0        64.0   \n",
       "4        72.0         103.0         104.0         105.0        74.0   \n",
       "\n",
       "        ...         P2_C2-C5_std  P2_D3-D5_std  P2_C1-C12_KD_ABS_std  \\\n",
       "0       ...             1.707825       0.57735              9.643651   \n",
       "1       ...             1.707825       0.57735             17.785762   \n",
       "2       ...             1.290994       0.00000             18.009257   \n",
       "3       ...             1.707825       0.57735              9.165151   \n",
       "4       ...             1.707825       0.57735             17.785762   \n",
       "\n",
       "   P2_C1-C12_D_std  P2_LARGE_KD_std  P2_C2-C5_sum  P2_D3-D5_sum  \\\n",
       "0        24.928565        23.245071         409.0           4.0   \n",
       "1        28.887521        25.890796         413.0           4.0   \n",
       "2        29.231642        25.514702         414.0           3.0   \n",
       "3        24.617293        22.479620         409.0           4.0   \n",
       "4        28.887521        25.890796         413.0           4.0   \n",
       "\n",
       "   P2_C1-C12_KD_ABS_sum  P2_C1-C12_D_sum  P2_LARGE_KD_sum  \n",
       "0                  27.0            191.0            113.0  \n",
       "1                  44.0            226.0            134.0  \n",
       "2                  43.0            226.0            135.0  \n",
       "3                  30.0            207.0            118.0  \n",
       "4                  44.0            226.0            134.0  \n",
       "\n",
       "[5 rows x 37 columns]"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "得到温度相关特征"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "df_temperature = df.set_index('样本id')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>P1_S1_A6_0C</th>\n",
       "      <th>P1_S2_A8_1C</th>\n",
       "      <th>P1_S3_A10_2C</th>\n",
       "      <th>P1_C1_C0_D</th>\n",
       "      <th>P1_C2_C0_D</th>\n",
       "      <th>P2_S1_A12_3C</th>\n",
       "      <th>P2_S2_A15_4C</th>\n",
       "      <th>P2_S3_A17_5C</th>\n",
       "      <th>P2_C3_C0_D</th>\n",
       "      <th>P2_C3_C2_D</th>\n",
       "      <th>...</th>\n",
       "      <th>P2_C2-C5_std</th>\n",
       "      <th>P2_D3-D5_std</th>\n",
       "      <th>P2_C1-C12_KD_ABS_std</th>\n",
       "      <th>P2_C1-C12_D_std</th>\n",
       "      <th>P2_LARGE_KD_std</th>\n",
       "      <th>P2_C2-C5_sum</th>\n",
       "      <th>P2_D3-D5_sum</th>\n",
       "      <th>P2_C1-C12_KD_ABS_sum</th>\n",
       "      <th>P2_C1-C12_D_sum</th>\n",
       "      <th>P2_LARGE_KD_sum</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>样本id</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>sample_1528</th>\n",
       "      <td>38.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>100</td>\n",
       "      <td>NaN</td>\n",
       "      <td>62.0</td>\n",
       "      <td>102.0</td>\n",
       "      <td>103.0</td>\n",
       "      <td>104.0</td>\n",
       "      <td>64.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>...</td>\n",
       "      <td>1.707825</td>\n",
       "      <td>0.57735</td>\n",
       "      <td>9.643651</td>\n",
       "      <td>24.928565</td>\n",
       "      <td>23.245071</td>\n",
       "      <td>409.0</td>\n",
       "      <td>4.0</td>\n",
       "      <td>27.0</td>\n",
       "      <td>191.0</td>\n",
       "      <td>113.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>sample_1698</th>\n",
       "      <td>29.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>101</td>\n",
       "      <td>NaN</td>\n",
       "      <td>72.0</td>\n",
       "      <td>103.0</td>\n",
       "      <td>104.0</td>\n",
       "      <td>105.0</td>\n",
       "      <td>74.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>...</td>\n",
       "      <td>1.707825</td>\n",
       "      <td>0.57735</td>\n",
       "      <td>17.785762</td>\n",
       "      <td>28.887521</td>\n",
       "      <td>25.890796</td>\n",
       "      <td>413.0</td>\n",
       "      <td>4.0</td>\n",
       "      <td>44.0</td>\n",
       "      <td>226.0</td>\n",
       "      <td>134.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>sample_639</th>\n",
       "      <td>29.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>102</td>\n",
       "      <td>NaN</td>\n",
       "      <td>73.0</td>\n",
       "      <td>103.0</td>\n",
       "      <td>104.0</td>\n",
       "      <td>105.0</td>\n",
       "      <td>74.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>...</td>\n",
       "      <td>1.290994</td>\n",
       "      <td>0.00000</td>\n",
       "      <td>18.009257</td>\n",
       "      <td>29.231642</td>\n",
       "      <td>25.514702</td>\n",
       "      <td>414.0</td>\n",
       "      <td>3.0</td>\n",
       "      <td>43.0</td>\n",
       "      <td>226.0</td>\n",
       "      <td>135.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>sample_483</th>\n",
       "      <td>38.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>100</td>\n",
       "      <td>NaN</td>\n",
       "      <td>62.0</td>\n",
       "      <td>102.0</td>\n",
       "      <td>103.0</td>\n",
       "      <td>104.0</td>\n",
       "      <td>64.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>...</td>\n",
       "      <td>1.707825</td>\n",
       "      <td>0.57735</td>\n",
       "      <td>9.165151</td>\n",
       "      <td>24.617293</td>\n",
       "      <td>22.479620</td>\n",
       "      <td>409.0</td>\n",
       "      <td>4.0</td>\n",
       "      <td>30.0</td>\n",
       "      <td>207.0</td>\n",
       "      <td>118.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>sample_617</th>\n",
       "      <td>29.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>101</td>\n",
       "      <td>NaN</td>\n",
       "      <td>72.0</td>\n",
       "      <td>103.0</td>\n",
       "      <td>104.0</td>\n",
       "      <td>105.0</td>\n",
       "      <td>74.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>...</td>\n",
       "      <td>1.707825</td>\n",
       "      <td>0.57735</td>\n",
       "      <td>17.785762</td>\n",
       "      <td>28.887521</td>\n",
       "      <td>25.890796</td>\n",
       "      <td>413.0</td>\n",
       "      <td>4.0</td>\n",
       "      <td>44.0</td>\n",
       "      <td>226.0</td>\n",
       "      <td>134.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 36 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "             P1_S1_A6_0C  P1_S2_A8_1C  P1_S3_A10_2C  P1_C1_C0_D  P1_C2_C0_D  \\\n",
       "样本id                                                                          \n",
       "sample_1528         38.0          NaN           100         NaN        62.0   \n",
       "sample_1698         29.0          NaN           101         NaN        72.0   \n",
       "sample_639          29.0          NaN           102         NaN        73.0   \n",
       "sample_483          38.0          NaN           100         NaN        62.0   \n",
       "sample_617          29.0          NaN           101         NaN        72.0   \n",
       "\n",
       "             P2_S1_A12_3C  P2_S2_A15_4C  P2_S3_A17_5C  P2_C3_C0_D  P2_C3_C2_D  \\\n",
       "样本id                                                                            \n",
       "sample_1528         102.0         103.0         104.0        64.0         2.0   \n",
       "sample_1698         103.0         104.0         105.0        74.0         2.0   \n",
       "sample_639          103.0         104.0         105.0        74.0         1.0   \n",
       "sample_483          102.0         103.0         104.0        64.0         2.0   \n",
       "sample_617          103.0         104.0         105.0        74.0         2.0   \n",
       "\n",
       "                  ...         P2_C2-C5_std  P2_D3-D5_std  \\\n",
       "样本id              ...                                      \n",
       "sample_1528       ...             1.707825       0.57735   \n",
       "sample_1698       ...             1.707825       0.57735   \n",
       "sample_639        ...             1.290994       0.00000   \n",
       "sample_483        ...             1.707825       0.57735   \n",
       "sample_617        ...             1.707825       0.57735   \n",
       "\n",
       "             P2_C1-C12_KD_ABS_std  P2_C1-C12_D_std  P2_LARGE_KD_std  \\\n",
       "样本id                                                                  \n",
       "sample_1528              9.643651        24.928565        23.245071   \n",
       "sample_1698             17.785762        28.887521        25.890796   \n",
       "sample_639              18.009257        29.231642        25.514702   \n",
       "sample_483               9.165151        24.617293        22.479620   \n",
       "sample_617              17.785762        28.887521        25.890796   \n",
       "\n",
       "             P2_C2-C5_sum  P2_D3-D5_sum  P2_C1-C12_KD_ABS_sum  \\\n",
       "样本id                                                            \n",
       "sample_1528         409.0           4.0                  27.0   \n",
       "sample_1698         413.0           4.0                  44.0   \n",
       "sample_639          414.0           3.0                  43.0   \n",
       "sample_483          409.0           4.0                  30.0   \n",
       "sample_617          413.0           4.0                  44.0   \n",
       "\n",
       "             P2_C1-C12_D_sum  P2_LARGE_KD_sum  \n",
       "样本id                                           \n",
       "sample_1528            191.0            113.0  \n",
       "sample_1698            226.0            134.0  \n",
       "sample_639             226.0            135.0  \n",
       "sample_483             207.0            118.0  \n",
       "sample_617             226.0            134.0  \n",
       "\n",
       "[5 rows x 36 columns]"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_temperature.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 时间相关特征"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# 时间计算方式转换\n",
    "def duration_outer(series1, series2):\n",
    "    duration = series1 - series2\n",
    "    duration = np.where(duration < 0, duration + 24*60, duration)\n",
    "    duration = np.where(duration > 12*60, 24*60 - duration, duration)\n",
    "    duration = np.where(duration > 6*60, 12*60 - duration, duration)\n",
    "    return duration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "raw = df_trn_tst.copy()\n",
    "df = pd.DataFrame(raw['样本id'])\n",
    "# 加热过程\n",
    "df['P1_S1_A5_0T'] = raw['A5_t']  # 初始时刻\n",
    "df['P1_S2_A9_2T'] = raw['A9_t']  # 初始时刻\n",
    "df['P1_T1_T0_D'] = duration_outer(raw['A7_t'], raw['A5_t'])\n",
    "# 初次测温时间差\n",
    "df['P1_T2_T1_D'] = duration_outer(raw['A9_t'], raw['A7_t'])\n",
    "# 二次测温时间差\n",
    "df['P1_T2_T0_K_D'] = duration_outer(raw['A9_t'], raw['A5_t'])\n",
    "# 开始加热至沸腾时间差\n",
    "\n",
    "# 水解过程\n",
    "df['P2_S1_A11_3T'] = raw['A11_t']  # 水解开始时刻\n",
    "df['P2_S1_A16_5T'] = raw['A16_t']  # 水解结束时刻\n",
    "\n",
    "df['P2_T3_T0_K_D'] = duration_outer(raw['A11_t'], raw['A5_t'])\n",
    "# 开始加热至投料时间差\n",
    "df['P2_T3_T2_K_D'] = duration_outer(raw['A11_t'], raw['A9_t'])\n",
    "# 恒温至投料投料时间差\n",
    "# df['P2_T4_T3_D'] = raw['A14_t'] - raw['A11_t']  # 水解初次测温时间差\n",
    "# df['P2_T5_T4_D'] = raw['A16_t'] - raw['A14_t']  # 水解结束时间差\n",
    "df['P2_T5_T3_K_D'] = duration_outer(raw['A16_t'], raw['A11_t'])\n",
    "# 水解时间差\n",
    "\n",
    "# 脱色过程\n",
    "df['P3_S1_A20_6T'] = raw['A20_at']  # 中和开始时刻\n",
    "df['P3_S2_A25_7T'] = raw['A24_t']  # 保温时刻\n",
    "\n",
    "df['P3_T6_T5_K_D'] = duration_outer(raw['A20_at'], raw['A16_t'])\n",
    "# 水解结束至中和间歇时间\n",
    "df['P3_T6_T6_K_D'] = duration_outer(raw['A20_bt'], raw['A20_at'])\n",
    "# 酸碱度中和时间\n",
    "df['P3_T7_T6_D'] = duration_outer(raw['A24_t'], raw['A20_bt'])\n",
    "# 中和结束至脱色间歇时间\n",
    "df['P3_T8_T7_K_D'] = duration_outer(raw['A26_t'], raw['A24_t'])\n",
    "# 脱色保温时间\n",
    "df['P3_T9_T8_D'] = duration_outer(raw['A28_at'], raw['A26_t'])\n",
    "# 脱色至抽滤间歇时间\n",
    "df['P3_T9_T9_K_D'] = duration_outer(raw['A28_bt'], raw['A28_at'])\n",
    "# 抽滤时间\n",
    "df['P3_T9_T5_1D'] = duration_outer(raw['A28_bt'], raw['A16_t'])\n",
    "df['P3_T9_T6_2D'] = duration_outer(raw['A28_bt'], raw['A20_at'])\n",
    "# 脱色总时间\n",
    "\n",
    "# 结晶过程\n",
    "df['P4_S1_B4_10T'] = raw['B4_at']  # 酸化开始时刻\n",
    "df['P4_S2_B5_11T'] = raw['B5_t']  # 结晶开始时刻\n",
    "df['P4_S3_B7_12T'] = raw['B7_t']  # 结晶结束时刻\n",
    "\n",
    "df['P4_T10_T9_D'] = duration_outer(raw['B4_at'], raw['A28_bt'])\n",
    "# 抽滤结束至酸化间歇时间\n",
    "df['P4_T10_T10_K_D'] = duration_outer(raw['B4_bt'], raw['B4_at'])\n",
    "# 酸化时间\n",
    "df['P4_T11_T10_K_D'] = duration_outer(raw['B5_t'], raw['B4_bt'])\n",
    "# 酸化至结晶间歇时间\n",
    "df['P4_T12_T11_K_D'] = duration_outer(raw['B7_t'], raw['B5_t'])\n",
    "# 自然结晶时间\n",
    "df['P4_T12_T9_1D'] = duration_outer(raw['B7_t'], raw['A28_bt'])\n",
    "df['P4_T12_T10_2D'] = duration_outer(raw['B7_t'], raw['B4_at'])\n",
    "# 结晶总时间\n",
    "\n",
    "# 甩滤过程\n",
    "df['P5_S1_B9_13T'] = raw['B9_at']  # 甩滤开始时刻\n",
    "df['P5_S3_B12_15T'] = np.where(\n",
    "    raw['B11_bt'].isnull(),\n",
    "    np.where(raw['B10_bt'].isnull(), raw['B9_bt'], raw['B10_bt']),\n",
    "    raw['B11_bt'])  # 甩滤结束时刻\n",
    "df['P5_T13_T12_D'] = duration_outer(raw['B9_at'], raw['B7_t'])\n",
    "# 酸化结束至甩滤间歇时间\n",
    "df['P5_T13_T13_K_D'] = duration_outer(raw['B9_bt'], raw['B9_at'])\n",
    "# 基本甩滤时间\n",
    "df['P5_T14_T13_D'] = duration_outer(raw['B10_at'], raw['B9_bt'])\n",
    "# 基本甩滤至补充甩滤1间歇时间\n",
    "df['P5_T14_T14_K_D'] = duration_outer(raw['B10_bt'], raw['B10_at'])\n",
    "# 补充甩滤1时间\n",
    "df['P5_T15_T14_D'] = duration_outer(raw['B11_at'], raw['B10_bt'])\n",
    "# 补充甩滤1至补充甩滤2间歇时间\n",
    "df['P5_T15_T13_K_D'] = duration_outer(raw['B11_bt'], raw['B11_at'])\n",
    "# 补充甩滤2时间\n",
    "df['P5_T15_T13_1D'] = \\\n",
    "    df[['P5_T13_T13_K_D', 'P5_T14_T14_K_D', 'P5_T13_T13_K_D']].sum(axis=1)\n",
    "df['P5_T15_T12_2D'] = duration_outer(\n",
    "    df['P5_S3_B12_15T'], df['P4_S3_B7_12T'])\n",
    "df['P5_T15_T12_3D'] = duration_outer(\n",
    "    df['P5_S3_B12_15T'], df['P5_S1_B9_13T'])\n",
    "# 总甩滤时间\n",
    "\n",
    "# 总流程时长\n",
    "df['P5_T15_T1_4D'] = \\\n",
    "    df[['P5_T15_T12_2D', 'P4_T12_T9_1D', 'P3_T9_T5_1D',\n",
    "        'P2_T3_T0_K_D', 'P2_T5_T3_K_D']].sum(axis=1)\n",
    "_funcs = ['mean', 'std', 'sum']\n",
    "for _func in _funcs:\n",
    "    df[f'P5__D_{_func}'] = \\\n",
    "        df[[_f for _f in df.columns if _f.endswith('_D')]].\\\n",
    "            abs().agg(_func, axis=1)\n",
    "    df[f'P5_K_D_{_func}'] = \\\n",
    "        df[[_f for _f in df.columns if _f.endswith('_K_D')]]. \\\n",
    "            abs().agg(_func, axis=1)\n",
    "    df[f'P5__D_{_func}'] = \\\n",
    "        df[[_f for _f in df.columns if _f.endswith('D')]]. \\\n",
    "            abs().agg(_func, axis=1)\n",
    "df_duration = df.set_index('样本id')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>P1_S1_A5_0T</th>\n",
       "      <th>P1_S2_A9_2T</th>\n",
       "      <th>P1_T1_T0_D</th>\n",
       "      <th>P1_T2_T1_D</th>\n",
       "      <th>P1_T2_T0_K_D</th>\n",
       "      <th>P2_S1_A11_3T</th>\n",
       "      <th>P2_S1_A16_5T</th>\n",
       "      <th>P2_T3_T0_K_D</th>\n",
       "      <th>P2_T3_T2_K_D</th>\n",
       "      <th>P2_T5_T3_K_D</th>\n",
       "      <th>...</th>\n",
       "      <th>P5_T15_T13_1D</th>\n",
       "      <th>P5_T15_T12_2D</th>\n",
       "      <th>P5_T15_T12_3D</th>\n",
       "      <th>P5_T15_T1_4D</th>\n",
       "      <th>P5__D_mean</th>\n",
       "      <th>P5_K_D_mean</th>\n",
       "      <th>P5__D_std</th>\n",
       "      <th>P5_K_D_std</th>\n",
       "      <th>P5__D_sum</th>\n",
       "      <th>P5_K_D_sum</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>样本id</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>sample_1528</th>\n",
       "      <td>810</td>\n",
       "      <td>930</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>120</td>\n",
       "      <td>990</td>\n",
       "      <td>1110</td>\n",
       "      <td>180</td>\n",
       "      <td>60</td>\n",
       "      <td>120</td>\n",
       "      <td>...</td>\n",
       "      <td>270.0</td>\n",
       "      <td>240.0</td>\n",
       "      <td>240.0</td>\n",
       "      <td>840.0</td>\n",
       "      <td>145.384615</td>\n",
       "      <td>90.000000</td>\n",
       "      <td>169.852425</td>\n",
       "      <td>63.639610</td>\n",
       "      <td>3780.0</td>\n",
       "      <td>1170.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>sample_1698</th>\n",
       "      <td>840</td>\n",
       "      <td>960</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>120</td>\n",
       "      <td>1020</td>\n",
       "      <td>1140</td>\n",
       "      <td>180</td>\n",
       "      <td>60</td>\n",
       "      <td>120</td>\n",
       "      <td>...</td>\n",
       "      <td>270.0</td>\n",
       "      <td>240.0</td>\n",
       "      <td>240.0</td>\n",
       "      <td>960.0</td>\n",
       "      <td>136.071429</td>\n",
       "      <td>90.000000</td>\n",
       "      <td>188.588113</td>\n",
       "      <td>76.258669</td>\n",
       "      <td>3810.0</td>\n",
       "      <td>1260.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>sample_639</th>\n",
       "      <td>840</td>\n",
       "      <td>960</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>120</td>\n",
       "      <td>1020</td>\n",
       "      <td>1140</td>\n",
       "      <td>180</td>\n",
       "      <td>60</td>\n",
       "      <td>120</td>\n",
       "      <td>...</td>\n",
       "      <td>270.0</td>\n",
       "      <td>240.0</td>\n",
       "      <td>240.0</td>\n",
       "      <td>900.0</td>\n",
       "      <td>123.214286</td>\n",
       "      <td>75.000000</td>\n",
       "      <td>173.654693</td>\n",
       "      <td>49.575118</td>\n",
       "      <td>3450.0</td>\n",
       "      <td>1050.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>sample_483</th>\n",
       "      <td>90</td>\n",
       "      <td>180</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>90</td>\n",
       "      <td>240</td>\n",
       "      <td>360</td>\n",
       "      <td>150</td>\n",
       "      <td>60</td>\n",
       "      <td>120</td>\n",
       "      <td>...</td>\n",
       "      <td>270.0</td>\n",
       "      <td>300.0</td>\n",
       "      <td>240.0</td>\n",
       "      <td>990.0</td>\n",
       "      <td>158.076923</td>\n",
       "      <td>73.846154</td>\n",
       "      <td>195.448596</td>\n",
       "      <td>46.822086</td>\n",
       "      <td>4110.0</td>\n",
       "      <td>960.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>sample_617</th>\n",
       "      <td>1320</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>120</td>\n",
       "      <td>60</td>\n",
       "      <td>180</td>\n",
       "      <td>180</td>\n",
       "      <td>60</td>\n",
       "      <td>120</td>\n",
       "      <td>...</td>\n",
       "      <td>270.0</td>\n",
       "      <td>240.0</td>\n",
       "      <td>240.0</td>\n",
       "      <td>900.0</td>\n",
       "      <td>123.214286</td>\n",
       "      <td>77.142857</td>\n",
       "      <td>173.846539</td>\n",
       "      <td>48.107024</td>\n",
       "      <td>3450.0</td>\n",
       "      <td>1080.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 47 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "             P1_S1_A5_0T  P1_S2_A9_2T  P1_T1_T0_D  P1_T2_T1_D  P1_T2_T0_K_D  \\\n",
       "样本id                                                                          \n",
       "sample_1528          810          930         NaN         NaN           120   \n",
       "sample_1698          840          960         NaN         NaN           120   \n",
       "sample_639           840          960         NaN         NaN           120   \n",
       "sample_483            90          180         NaN         NaN            90   \n",
       "sample_617          1320            0         NaN         NaN           120   \n",
       "\n",
       "             P2_S1_A11_3T  P2_S1_A16_5T  P2_T3_T0_K_D  P2_T3_T2_K_D  \\\n",
       "样本id                                                                  \n",
       "sample_1528           990          1110           180            60   \n",
       "sample_1698          1020          1140           180            60   \n",
       "sample_639           1020          1140           180            60   \n",
       "sample_483            240           360           150            60   \n",
       "sample_617             60           180           180            60   \n",
       "\n",
       "             P2_T5_T3_K_D     ...      P5_T15_T13_1D  P5_T15_T12_2D  \\\n",
       "样本id                          ...                                     \n",
       "sample_1528           120     ...              270.0          240.0   \n",
       "sample_1698           120     ...              270.0          240.0   \n",
       "sample_639            120     ...              270.0          240.0   \n",
       "sample_483            120     ...              270.0          300.0   \n",
       "sample_617            120     ...              270.0          240.0   \n",
       "\n",
       "             P5_T15_T12_3D  P5_T15_T1_4D  P5__D_mean  P5_K_D_mean   P5__D_std  \\\n",
       "样本id                                                                            \n",
       "sample_1528          240.0         840.0  145.384615    90.000000  169.852425   \n",
       "sample_1698          240.0         960.0  136.071429    90.000000  188.588113   \n",
       "sample_639           240.0         900.0  123.214286    75.000000  173.654693   \n",
       "sample_483           240.0         990.0  158.076923    73.846154  195.448596   \n",
       "sample_617           240.0         900.0  123.214286    77.142857  173.846539   \n",
       "\n",
       "             P5_K_D_std  P5__D_sum  P5_K_D_sum  \n",
       "样本id                                            \n",
       "sample_1528   63.639610     3780.0      1170.0  \n",
       "sample_1698   76.258669     3810.0      1260.0  \n",
       "sample_639    49.575118     3450.0      1050.0  \n",
       "sample_483    46.822086     4110.0       960.0  \n",
       "sample_617    48.107024     3450.0      1080.0  \n",
       "\n",
       "[5 rows x 47 columns]"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_duration.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 水耗相关特征"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "na_value=405\n",
    "\n",
    "df_trn_tst = df_trn_tst.copy()\n",
    "df = pd.DataFrame(raw['样本id'])\n",
    "# 耗水\n",
    "df['P2_W_1M'] = raw['A4']\n",
    "df['P2_W_2M'] = raw['A19']\n",
    "# 耗盐酸\n",
    "df['P3_H_1M'] = raw['A21'].fillna(50)\n",
    "df['P4_H_2M'] = raw['B1'].fillna(320)\n",
    "# 氢氧化钠\n",
    "df['P2_N_1M'] = raw['A3'].fillna(na_value)\n",
    "# 4-氰基吡啶\n",
    "df['P2_C_1M'] = raw['A1']\n",
    "\n",
    "df['P5_W_3M'] = raw['B12'].fillna(1200)\n",
    "df['P5_W_1M'] = df['P2_W_1M'] + df['P2_W_2M']\n",
    "df['P5_W_3M'] = df['P2_W_1M'] + df['P2_W_2M'] + df['P5_W_3M']\n",
    "df['P5_H_1M'] = df['P3_H_1M'] + df['P4_H_2M']\n",
    "df['P5_M_0M'] = raw['A1'] + df['P2_N_1M'] + df['P5_W_1M'] + df['P4_H_2M']\n",
    "df['P5_M_1M'] = df['P5_M_0M'] + df['P5_W_3M']\n",
    "df['P5_M_2M'] = df['P5_M_1M'] + df['P3_H_1M']\n",
    "# 理论产出\n",
    "df['P5_O_1M'] = raw['B14']\n",
    "df['P5_O_5M'] = raw['B14'].replace(418, 420).replace(405, 400).\\\n",
    "    replace(395, 390).replace(392, 390).replace(387, 380).\\\n",
    "    replace(385, 380).replace(370, 360).replace(350, 360).\\\n",
    "    replace(350, 360).replace(340, 360).replace(290, 280).\\\n",
    "    replace(260, 280).replace(256, 280)\n",
    "_fs = [_f for _f in df.columns if _f.endswith('M')]\n",
    "for _f in _fs[:-2]:\n",
    "    df[f'{_f}_P5_O_1M_R'] = df['P5_O_1M'] / df[_f]\n",
    "    df[f'{_f}_P5_O_5M_R'] = df['P5_O_5M'] / df[_f]\n",
    "for i in range(len(_fs[:6])):\n",
    "    _f, _sub_fs = _fs[i], _fs[(i+1):6]\n",
    "    for _f_div in _sub_fs:\n",
    "        df[f'{_f}_{_f_div}_R'] = df[_f] / df[_f_div]\n",
    "df_materials = df.set_index('样本id')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "raw = df_trn_tst.copy()\n",
    "df = pd.DataFrame(raw['样本id'])\n",
    "df['P5_NOT_NUM_N'] = raw.iloc[:, 1:-1].notnull().sum(axis=1)\n",
    "df['P5_PH_1N'] = raw['A22']\n",
    "df['P5_PH_2N'] = raw['A23']\n",
    "df['P5_PH_2N'] = raw['B2']\n",
    "df['P5_A7_1N'] = raw['A7_t'].isnull().astype(int)\n",
    "df['P5_O_2M'] = (raw['B14'] <= 360).astype(int)\n",
    "df['P5_1_3M'] = raw['B13']\n",
    "df_interact = df.set_index('样本id')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>P5_NOT_NUM_N</th>\n",
       "      <th>P5_PH_1N</th>\n",
       "      <th>P5_PH_2N</th>\n",
       "      <th>P5_A7_1N</th>\n",
       "      <th>P5_O_2M</th>\n",
       "      <th>P5_1_3M</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>样本id</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>sample_1528</th>\n",
       "      <td>42</td>\n",
       "      <td>9.0</td>\n",
       "      <td>3.5</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0.15</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>sample_1698</th>\n",
       "      <td>44</td>\n",
       "      <td>9.0</td>\n",
       "      <td>3.5</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0.15</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>sample_639</th>\n",
       "      <td>44</td>\n",
       "      <td>9.0</td>\n",
       "      <td>3.5</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0.15</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>sample_483</th>\n",
       "      <td>42</td>\n",
       "      <td>10.0</td>\n",
       "      <td>3.5</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0.15</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>sample_617</th>\n",
       "      <td>44</td>\n",
       "      <td>9.0</td>\n",
       "      <td>3.5</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0.15</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "             P5_NOT_NUM_N  P5_PH_1N  P5_PH_2N  P5_A7_1N  P5_O_2M  P5_1_3M\n",
       "样本id                                                                     \n",
       "sample_1528            42       9.0       3.5         1        0     0.15\n",
       "sample_1698            44       9.0       3.5         1        0     0.15\n",
       "sample_639             44       9.0       3.5         1        0     0.15\n",
       "sample_483             42      10.0       3.5         1        0     0.15\n",
       "sample_617             44       9.0       3.5         1        0     0.15"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_interact.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 合并所有特征"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "df_feature = pd.concat([df_materials, df_duration, df_temperature, df_interact], axis=1).reset_index()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "df_trn = df_feature.iloc[:len(df_trn)].reset_index(drop=True)\n",
    "df_trn['收率'] = df_target\n",
    "df_tst = df_feature.iloc[len(df_trn):].reset_index(drop=True)\n",
    "df_tst['收率'] = np.nan"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>样本id</th>\n",
       "      <th>P2_W_1M</th>\n",
       "      <th>P2_W_2M</th>\n",
       "      <th>P3_H_1M</th>\n",
       "      <th>P4_H_2M</th>\n",
       "      <th>P2_N_1M</th>\n",
       "      <th>P2_C_1M</th>\n",
       "      <th>P5_W_3M</th>\n",
       "      <th>P5_W_1M</th>\n",
       "      <th>P5_H_1M</th>\n",
       "      <th>...</th>\n",
       "      <th>P2_C1-C12_KD_ABS_sum</th>\n",
       "      <th>P2_C1-C12_D_sum</th>\n",
       "      <th>P2_LARGE_KD_sum</th>\n",
       "      <th>P5_NOT_NUM_N</th>\n",
       "      <th>P5_PH_1N</th>\n",
       "      <th>P5_PH_2N</th>\n",
       "      <th>P5_A7_1N</th>\n",
       "      <th>P5_O_2M</th>\n",
       "      <th>P5_1_3M</th>\n",
       "      <th>收率</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>sample_1528</td>\n",
       "      <td>700</td>\n",
       "      <td>300</td>\n",
       "      <td>50.0</td>\n",
       "      <td>350.0</td>\n",
       "      <td>405.0</td>\n",
       "      <td>300</td>\n",
       "      <td>1800.0</td>\n",
       "      <td>1000</td>\n",
       "      <td>400.0</td>\n",
       "      <td>...</td>\n",
       "      <td>27.0</td>\n",
       "      <td>191.0</td>\n",
       "      <td>113.0</td>\n",
       "      <td>42</td>\n",
       "      <td>9.0</td>\n",
       "      <td>3.5</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0.15</td>\n",
       "      <td>0.879</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>sample_1698</td>\n",
       "      <td>700</td>\n",
       "      <td>200</td>\n",
       "      <td>50.0</td>\n",
       "      <td>320.0</td>\n",
       "      <td>405.0</td>\n",
       "      <td>300</td>\n",
       "      <td>2100.0</td>\n",
       "      <td>900</td>\n",
       "      <td>370.0</td>\n",
       "      <td>...</td>\n",
       "      <td>44.0</td>\n",
       "      <td>226.0</td>\n",
       "      <td>134.0</td>\n",
       "      <td>44</td>\n",
       "      <td>9.0</td>\n",
       "      <td>3.5</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0.15</td>\n",
       "      <td>0.902</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>sample_639</td>\n",
       "      <td>700</td>\n",
       "      <td>200</td>\n",
       "      <td>50.0</td>\n",
       "      <td>320.0</td>\n",
       "      <td>405.0</td>\n",
       "      <td>300</td>\n",
       "      <td>2100.0</td>\n",
       "      <td>900</td>\n",
       "      <td>370.0</td>\n",
       "      <td>...</td>\n",
       "      <td>43.0</td>\n",
       "      <td>226.0</td>\n",
       "      <td>135.0</td>\n",
       "      <td>44</td>\n",
       "      <td>9.0</td>\n",
       "      <td>3.5</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0.15</td>\n",
       "      <td>0.936</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>sample_483</td>\n",
       "      <td>700</td>\n",
       "      <td>200</td>\n",
       "      <td>50.0</td>\n",
       "      <td>290.0</td>\n",
       "      <td>405.0</td>\n",
       "      <td>300</td>\n",
       "      <td>1700.0</td>\n",
       "      <td>900</td>\n",
       "      <td>340.0</td>\n",
       "      <td>...</td>\n",
       "      <td>30.0</td>\n",
       "      <td>207.0</td>\n",
       "      <td>118.0</td>\n",
       "      <td>42</td>\n",
       "      <td>10.0</td>\n",
       "      <td>3.5</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0.15</td>\n",
       "      <td>0.902</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>sample_617</td>\n",
       "      <td>700</td>\n",
       "      <td>200</td>\n",
       "      <td>50.0</td>\n",
       "      <td>320.0</td>\n",
       "      <td>405.0</td>\n",
       "      <td>300</td>\n",
       "      <td>2100.0</td>\n",
       "      <td>900</td>\n",
       "      <td>370.0</td>\n",
       "      <td>...</td>\n",
       "      <td>44.0</td>\n",
       "      <td>226.0</td>\n",
       "      <td>134.0</td>\n",
       "      <td>44</td>\n",
       "      <td>9.0</td>\n",
       "      <td>3.5</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0.15</td>\n",
       "      <td>0.983</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 144 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "          样本id  P2_W_1M  P2_W_2M  P3_H_1M  P4_H_2M  P2_N_1M  P2_C_1M  P5_W_3M  \\\n",
       "0  sample_1528      700      300     50.0    350.0    405.0      300   1800.0   \n",
       "1  sample_1698      700      200     50.0    320.0    405.0      300   2100.0   \n",
       "2   sample_639      700      200     50.0    320.0    405.0      300   2100.0   \n",
       "3   sample_483      700      200     50.0    290.0    405.0      300   1700.0   \n",
       "4   sample_617      700      200     50.0    320.0    405.0      300   2100.0   \n",
       "\n",
       "   P5_W_1M  P5_H_1M  ...    P2_C1-C12_KD_ABS_sum  P2_C1-C12_D_sum  \\\n",
       "0     1000    400.0  ...                    27.0            191.0   \n",
       "1      900    370.0  ...                    44.0            226.0   \n",
       "2      900    370.0  ...                    43.0            226.0   \n",
       "3      900    340.0  ...                    30.0            207.0   \n",
       "4      900    370.0  ...                    44.0            226.0   \n",
       "\n",
       "   P2_LARGE_KD_sum  P5_NOT_NUM_N  P5_PH_1N  P5_PH_2N  P5_A7_1N  P5_O_2M  \\\n",
       "0            113.0            42       9.0       3.5         1        0   \n",
       "1            134.0            44       9.0       3.5         1        0   \n",
       "2            135.0            44       9.0       3.5         1        0   \n",
       "3            118.0            42      10.0       3.5         1        0   \n",
       "4            134.0            44       9.0       3.5         1        0   \n",
       "\n",
       "   P5_1_3M     收率  \n",
       "0     0.15  0.879  \n",
       "1     0.15  0.902  \n",
       "2     0.15  0.936  \n",
       "3     0.15  0.902  \n",
       "4     0.15  0.983  \n",
       "\n",
       "[5 rows x 144 columns]"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_trn.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "for _df in [df_trn, df_tst]:\n",
    "    _df.insert(1, 'id', _df['样本id'].str.split('_').str[1].astype(float))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>样本id</th>\n",
       "      <th>id</th>\n",
       "      <th>P2_W_1M</th>\n",
       "      <th>P2_W_2M</th>\n",
       "      <th>P3_H_1M</th>\n",
       "      <th>P4_H_2M</th>\n",
       "      <th>P2_N_1M</th>\n",
       "      <th>P2_C_1M</th>\n",
       "      <th>P5_W_3M</th>\n",
       "      <th>P5_W_1M</th>\n",
       "      <th>...</th>\n",
       "      <th>P2_C1-C12_KD_ABS_sum</th>\n",
       "      <th>P2_C1-C12_D_sum</th>\n",
       "      <th>P2_LARGE_KD_sum</th>\n",
       "      <th>P5_NOT_NUM_N</th>\n",
       "      <th>P5_PH_1N</th>\n",
       "      <th>P5_PH_2N</th>\n",
       "      <th>P5_A7_1N</th>\n",
       "      <th>P5_O_2M</th>\n",
       "      <th>P5_1_3M</th>\n",
       "      <th>收率</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>sample_1528</td>\n",
       "      <td>1528.0</td>\n",
       "      <td>700</td>\n",
       "      <td>300</td>\n",
       "      <td>50.0</td>\n",
       "      <td>350.0</td>\n",
       "      <td>405.0</td>\n",
       "      <td>300</td>\n",
       "      <td>1800.0</td>\n",
       "      <td>1000</td>\n",
       "      <td>...</td>\n",
       "      <td>27.0</td>\n",
       "      <td>191.0</td>\n",
       "      <td>113.0</td>\n",
       "      <td>42</td>\n",
       "      <td>9.0</td>\n",
       "      <td>3.5</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0.15</td>\n",
       "      <td>0.879</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>sample_1698</td>\n",
       "      <td>1698.0</td>\n",
       "      <td>700</td>\n",
       "      <td>200</td>\n",
       "      <td>50.0</td>\n",
       "      <td>320.0</td>\n",
       "      <td>405.0</td>\n",
       "      <td>300</td>\n",
       "      <td>2100.0</td>\n",
       "      <td>900</td>\n",
       "      <td>...</td>\n",
       "      <td>44.0</td>\n",
       "      <td>226.0</td>\n",
       "      <td>134.0</td>\n",
       "      <td>44</td>\n",
       "      <td>9.0</td>\n",
       "      <td>3.5</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0.15</td>\n",
       "      <td>0.902</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>sample_639</td>\n",
       "      <td>639.0</td>\n",
       "      <td>700</td>\n",
       "      <td>200</td>\n",
       "      <td>50.0</td>\n",
       "      <td>320.0</td>\n",
       "      <td>405.0</td>\n",
       "      <td>300</td>\n",
       "      <td>2100.0</td>\n",
       "      <td>900</td>\n",
       "      <td>...</td>\n",
       "      <td>43.0</td>\n",
       "      <td>226.0</td>\n",
       "      <td>135.0</td>\n",
       "      <td>44</td>\n",
       "      <td>9.0</td>\n",
       "      <td>3.5</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0.15</td>\n",
       "      <td>0.936</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>sample_483</td>\n",
       "      <td>483.0</td>\n",
       "      <td>700</td>\n",
       "      <td>200</td>\n",
       "      <td>50.0</td>\n",
       "      <td>290.0</td>\n",
       "      <td>405.0</td>\n",
       "      <td>300</td>\n",
       "      <td>1700.0</td>\n",
       "      <td>900</td>\n",
       "      <td>...</td>\n",
       "      <td>30.0</td>\n",
       "      <td>207.0</td>\n",
       "      <td>118.0</td>\n",
       "      <td>42</td>\n",
       "      <td>10.0</td>\n",
       "      <td>3.5</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0.15</td>\n",
       "      <td>0.902</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>sample_617</td>\n",
       "      <td>617.0</td>\n",
       "      <td>700</td>\n",
       "      <td>200</td>\n",
       "      <td>50.0</td>\n",
       "      <td>320.0</td>\n",
       "      <td>405.0</td>\n",
       "      <td>300</td>\n",
       "      <td>2100.0</td>\n",
       "      <td>900</td>\n",
       "      <td>...</td>\n",
       "      <td>44.0</td>\n",
       "      <td>226.0</td>\n",
       "      <td>134.0</td>\n",
       "      <td>44</td>\n",
       "      <td>9.0</td>\n",
       "      <td>3.5</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0.15</td>\n",
       "      <td>0.983</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 145 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "          样本id      id  P2_W_1M  P2_W_2M  P3_H_1M  P4_H_2M  P2_N_1M  P2_C_1M  \\\n",
       "0  sample_1528  1528.0      700      300     50.0    350.0    405.0      300   \n",
       "1  sample_1698  1698.0      700      200     50.0    320.0    405.0      300   \n",
       "2   sample_639   639.0      700      200     50.0    320.0    405.0      300   \n",
       "3   sample_483   483.0      700      200     50.0    290.0    405.0      300   \n",
       "4   sample_617   617.0      700      200     50.0    320.0    405.0      300   \n",
       "\n",
       "   P5_W_3M  P5_W_1M  ...    P2_C1-C12_KD_ABS_sum  P2_C1-C12_D_sum  \\\n",
       "0   1800.0     1000  ...                    27.0            191.0   \n",
       "1   2100.0      900  ...                    44.0            226.0   \n",
       "2   2100.0      900  ...                    43.0            226.0   \n",
       "3   1700.0      900  ...                    30.0            207.0   \n",
       "4   2100.0      900  ...                    44.0            226.0   \n",
       "\n",
       "   P2_LARGE_KD_sum  P5_NOT_NUM_N  P5_PH_1N  P5_PH_2N  P5_A7_1N  P5_O_2M  \\\n",
       "0            113.0            42       9.0       3.5         1        0   \n",
       "1            134.0            44       9.0       3.5         1        0   \n",
       "2            135.0            44       9.0       3.5         1        0   \n",
       "3            118.0            42      10.0       3.5         1        0   \n",
       "4            134.0            44       9.0       3.5         1        0   \n",
       "\n",
       "   P5_1_3M     收率  \n",
       "0     0.15  0.879  \n",
       "1     0.15  0.902  \n",
       "2     0.15  0.936  \n",
       "3     0.15  0.902  \n",
       "4     0.15  0.983  \n",
       "\n",
       "[5 rows x 145 columns]"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_trn.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "筛选常规数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYgAAAD8CAYAAABthzNFAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4zLCBo\ndHRwOi8vbWF0cGxvdGxpYi5vcmcvnQurowAAE5ZJREFUeJzt3X+wZ3V93/HnS34jyo9wobiwLpo1\ndZOpYFaG1loJRKNMEzQJCeSHG0rdTMUmTm2mSJ1KOmUmbVUax44VAxFoDAF/0oQEF4pxkgFxEeTX\naliRwroMbCKCiAHBd//4nqs3y2fvPXfvPd/vd7nPx8x37ud8zud8v+/P3Lu8OD++56SqkCRpZ8+b\ndAGSpOlkQEiSmgwISVKTASFJajIgJElNBoQkqcmAkCQ1GRCSpCYDQpLUtPekC1iKww8/vNasWTPp\nMiRpj3LLLbf8bVXNLDRujw6INWvWsHnz5kmXIUl7lCT/r884DzFJkpoMCElSkwEhSWoyICRJTQaE\nJKnJgJAkNRkQkqQmA0KS1GRASJKa9uhvUmuRzj94gp/96OQ+W9JucQ9CktRkQEiSmgwISVKTASFJ\najIgJElNgwVEkv2T3Jzky0nuSvK7Xf+xSb6Q5J4kf5Jk365/v255a7d+zVC1SZIWNuQexJPAyVX1\nCuA44A1JTgT+K3BhVa0FHgHO7safDTxSVT8KXNiNkyRNyGABUSOPd4v7dK8CTgY+3vVfCrypa5/W\nLdOtPyVJhqpPkjS/Qc9BJNkryW3Aw8Am4GvAt6rq6W7INmBV114FPADQrX8U+JEh65Mk7dqgAVFV\nz1TVccDRwAnAy1vDup+tvYXauSPJxiSbk2zesWPH8hUrSfoHxnIVU1V9C/gccCJwSJLZW3wcDWzv\n2tuAYwC69QcD32y810VVtb6q1s/MzAxduiStWENexTST5JCufQDw08AW4AbgF7thG4DPdO2ru2W6\n9f+3qp61ByFJGo8hb9Z3FHBpkr0YBdGVVfWnSe4GrkjyX4BbgYu78RcDlyfZymjP4YwBa5MkLWCw\ngKiq24HjG/33MjofsXP/3wOnD1WPJGlx/Ca1JKnJgJAkNRkQkqQmA0KS1GRASJKaDAhJUpMBIUlq\nMiAkSU0GhCSpyYCQJDUNeS8maWU7/+AJfe6jk/lcPee4ByFJajIgJElNBoQkqcmAkCQ1GRCSpCYD\nQpLUZEBIkpoMCElSkwEhSWoyICRJTQaEJKnJgJAkNRkQkqSmwQIiyTFJbkiyJcldSX676z8/yTeS\n3Na9Tp2zzbuSbE3y1SQ/M1RtkqSFDXm776eBd1bVl5K8ALglyaZu3YVV9d65g5OsA84Afhx4EXBd\nkpdV1TMD1ihJ2oXB9iCq6sGq+lLX/jawBVg1zyanAVdU1ZNV9XVgK3DCUPVJkuY3lnMQSdYAxwNf\n6LrenuT2JJckObTrWwU8MGezbTQCJcnGJJuTbN6xY8eAVUvSyjZ4QCQ5CPgE8I6qegz4EPBS4Djg\nQeB9s0Mbm9ezOqouqqr1VbV+ZmZmoKolSYMGRJJ9GIXDH1XVJwGq6qGqeqaqvg98hB8eRtoGHDNn\n86OB7UPWJ0natSGvYgpwMbClqt4/p/+oOcPeDNzZta8GzkiyX5JjgbXAzUPVJ0ma35BXMb0a+HXg\njiS3dX3nAWcmOY7R4aP7gN8EqKq7klwJ3M3oCqhzvIJJkiZnsICoqr+ifV7hmnm2uQC4YKiaJEn9\n+U1qSVKTASFJajIgJElNBoQkqcmAkCQ1GRCSpCYDQpLUZEBIkpoMCElSkwEhSWoyICRJTQaEJKnJ\ngJAkNRkQkqQmA0KS1GRASJKaDAhJUpMBIUlqMiAkSU0GhCSpyYCQJDUZEJKkJgNCktRkQEiSmgYL\niCTHJLkhyZYkdyX57a7/sCSbktzT/Ty060+SDyTZmuT2JK8cqjZJ0sKG3IN4GnhnVb0cOBE4J8k6\n4Fzg+qpaC1zfLQO8EVjbvTYCHxqwNknSAnoFRJKfWOwbV9WDVfWlrv1tYAuwCjgNuLQbdinwpq59\nGnBZjdwEHJLkqMV+riRpefTdg/hfSW5O8rYkhyz2Q5KsAY4HvgAcWVUPwihEgCO6YauAB+Zstq3r\n2/m9NibZnGTzjh07FluKJKmnXgFRVf8c+FXgGGBzko8leV2fbZMcBHwCeEdVPTbf0NZHN2q5qKrW\nV9X6mZmZPiVIknZD73MQVXUP8G7gPwCvBT6Q5CtJfn5X2yTZh1E4/FFVfbLrfmj20FH38+Gufxuj\nAJp1NLC9b32SpOXV9xzEP0lyIaPzCCcDP9udfD4ZuHAX2wS4GNhSVe+fs+pqYEPX3gB8Zk7/W7qr\nmU4EHp09FCVJGr+9e477IPAR4Lyq+u5sZ1VtT/LuXWzzauDXgTuS3Nb1nQf8HnBlkrOB+4HTu3XX\nAKcCW4EngLMWMxFJ0vLqGxCnAt+tqmcAkjwP2L+qnqiqy1sbVNVf0T6vAHBKY3wB5/SsR5I0sL7n\nIK4DDpizfGDXJ0l6juobEPtX1eOzC137wGFKkiRNg74B8Z25t75I8pPAd+cZL0naw/U9B/EO4Kok\ns5edHgX88jAlSZKmQa+AqKovJvnHwI8xOvH8lar63qCVSZImqu8eBMCrgDXdNscnoaouG6QqSdLE\n9QqIJJcDLwVuA57pugswICTpOarvHsR6YF33XQVJ0grQ9yqmO4F/NGQhkqTp0ncP4nDg7iQ3A0/O\ndlbVzw1SlSRp4voGxPlDFiFJmj59L3P9yyQvBtZW1XVJDgT2GrY0SdIk9b3d91uBjwMf7rpWAZ8e\nqihJ0uT1PUl9DqPbdz8GP3h40BHzbiFJ2qP1DYgnq+qp2YUke9N4HKgk6bmjb0D8ZZLzgAO6Z1Ff\nBfyf4cqSJE1a34A4F9gB3AH8JqOnv+3qSXKSpOeAvlcxfZ/RI0c/Mmw5kqRp0fdeTF+ncc6hql6y\n7BVJkqbCYu7FNGt/4HTgsOUvR5I0LXqdg6iqv5vz+kZV/Q/g5IFrkyRNUN9DTK+cs/g8RnsULxik\nIknSVOh7iOl9c9pPA/cBv7Ts1UiSpkbfq5h+auhCJEnTpe8hpn833/qqen9jm0uAfwk8XFU/0fWd\nD7yV0XcqAM6rqmu6de8Czmb0xLrfqqpre85BkjSAxVzF9Crg6m75Z4HPAw/Ms81HgQ/y7MeSXlhV\n753bkWQdcAbw48CLgOuSvKyqnkHS4px/8IQ+99HJfK4Gs5gHBr2yqr4NP9gTuKqq/vWuNqiqzydZ\n0/P9TwOuqKonga8n2QqcANzYc3tJ0jLrGxCrgafmLD8FrNnNz3x7krcAm4F3VtUjjG4fftOcMdu6\nvmdJshHYCLB69erdLGHCJvV/eJK0CH3vxXQ5cHOS85O8B/gCzz501MeHgJcCxwEP8sOro9IY27xb\nbFVdVFXrq2r9zMzMbpQgSeqj71VMFyT5c+A1XddZVXXrYj+sqh6abSf5CPCn3eI24Jg5Q48Gti/2\n/SVJy6fvHgTAgcBjVfX7wLYkxy72w5IcNWfxzcCdXftq4Iwk+3Xvuxa4ebHvL0laPn0vc30PoyuZ\nfgz4Q2Af4H8zesrcrrb5Y+Ak4PAk24D3ACclOY7R4aP7GN06nKq6K8mVwN2Mvoh3jlcwSdJk9T1J\n/WbgeOBLAFW1Pcm8t9qoqjMb3RfPM/4C4IKe9UiSBtY3IJ6qqkpSAEmeP2BN0vLxijFpt/U9B3Fl\nkg8DhyR5K3AdPjxIkp7T+l7F9N7uWdSPMToP8Z+qatOglUmSJmrBgEiyF3BtVf00YChI0gqx4CGm\n7mqiJ5J4MFeSVpC+J6n/HrgjySbgO7OdVfVbg1QlSZq4vgHxZ91LkrRCzBsQSVZX1f1Vdem4CpIk\nTYeFzkF8eraR5BMD1yJJmiILBcTcu6y+ZMhCJEnTZaGAqF20JUnPcQudpH5FkscY7Ukc0LXplquq\nXjhodZKkiZk3IKpqr3EVIkmaLot5HoQkaQUxICRJTQaEJKnJgJAkNRkQkqQmA0KS1GRASJKaDAhJ\nUpMBIUlqMiAkSU2DBUSSS5I8nOTOOX2HJdmU5J7u56Fdf5J8IMnWJLcneeVQdUmS+hlyD+KjwBt2\n6jsXuL6q1gLXd8sAbwTWdq+NwIcGrEuS1MNgAVFVnwe+uVP3acDs0+kuBd40p/+yGrkJOCTJUUPV\nJkla2LjPQRxZVQ8CdD+P6PpXAQ/MGbet65MkTci0nKROo6/5gKIkG5NsTrJ5x44dA5clSSvXuAPi\nodlDR93Ph7v+bcAxc8YdDWxvvUFVXVRV66tq/czMzKDFStJKNu6AuBrY0LU3AJ+Z0/+W7mqmE4FH\nZw9FSZImY6FHju62JH8MnAQcnmQb8B7g94Ark5wN3A+c3g2/BjgV2Ao8AZw1VF2SpH4GC4iqOnMX\nq05pjC3gnKFqkSQt3rScpJYkTRkDQpLUZEBIkpoGOwchSWNz/sET+txHJ/O5Y+IehCSpyYCQJDUZ\nEJKkJgNCktRkQEiSmgwISVKTASFJalq534OY1HXTkrSHcA9CktRkQEiSmgwISVLTyj0HIUlLNclz\nmWO4D5R7EJKkJgNCktTkISaNh5cVS3sc9yAkSU0GhCSpyYCQJDUZEJKkJgNCktQ0kauYktwHfBt4\nBni6qtYnOQz4E2ANcB/wS1X1yCTqkyRNdg/ip6rquKpa3y2fC1xfVWuB67tlSdKETNMhptOAS7v2\npcCbJliLJK14kwqIAj6b5JYkG7u+I6vqQYDu5xETqk2SxOS+Sf3qqtqe5AhgU5Kv9N2wC5SNAKtX\nrx6qPkla8SayB1FV27ufDwOfAk4AHkpyFED38+FdbHtRVa2vqvUzMzPjKlmSVpyxB0SS5yd5wWwb\neD1wJ3A1sKEbtgH4zLhrkyT90CQOMR0JfCrJ7Od/rKr+IskXgSuTnA3cD5w+gdokSZ2xB0RV3Qu8\notH/d8Ap465HktQ2TZe5SpKmiAEhSWrygUGSlocPhXrOcQ9CktRkQEiSmgwISVKTASFJajIgJElN\nBoQkqcmAkCQ1GRCSpCYDQpLUZEBIkpoMCElSkwEhSWoyICRJTQaEJKnJgJAkNRkQkqQmA0KS1GRA\nSJKaDAhJUpMBIUlqMiAkSU1TFxBJ3pDkq0m2Jjl30vVI0ko1VQGRZC/gfwJvBNYBZyZZN9mqJGll\nmqqAAE4AtlbVvVX1FHAFcNqEa5KkFWnaAmIV8MCc5W1dnyRpzPaedAE7SaOv/sGAZCOwsVt8PMlX\nB69qGIcDfzvpIpaR85luzme6LX4+v9v6z2VvL+4zaNoCYhtwzJzlo4HtcwdU1UXAReMsaghJNlfV\n+knXsVycz3RzPtNtWuczbYeYvgisTXJskn2BM4CrJ1yTJK1IU7UHUVVPJ3k7cC2wF3BJVd014bIk\naUWaqoAAqKprgGsmXccY7PGHyXbifKab85luUzmfVNXCoyRJK860nYOQJE0JA2KZLXSrkCSrk9yQ\n5NYktyc5tevfJ8mlSe5IsiXJu8Zf/bP1mM+Lk1zfzeVzSY6es25Dknu614bxVt62u/NJclySG5Pc\n1a375fFX/2xL+f1061+Y5BtJPji+qndtiX9vq5N8tvv3c3eSNeOsvWWJ8/lv3d/bliQfSLKk61p3\nS1X5WqYXoxPrXwNeAuwLfBlYt9OYi4B/07XXAfd17V8BrujaBwL3AWv2gPlcBWzo2icDl3ftw4B7\nu5+Hdu1D9+D5vAxY27VfBDwIHLKnzmfO+t8HPgZ8cJJzWY75AJ8DXte1DwIO3FPnA/wz4K+799gL\nuBE4adxzcA9iefW5VUgBL+zaB/PD73kU8PwkewMHAE8Bjw1f8rz6zGcdcH3XvmHO+p8BNlXVN6vq\nEWAT8IYx1Dyf3Z5PVf1NVd3TtbcDDwMzY6l615by+yHJTwJHAp8dQ6197PZ8unu27V1VmwCq6vGq\nemI8Ze/SUn4/BezPKFj2A/YBHhq84p0YEMurz61Czgd+Lck2Rldr/duu/+PAdxj9n+n9wHur6puD\nVruwPvP5MvALXfvNwAuS/EjPbcdtKfP5gSQnMPqH+7WB6uxrt+eT5HnA+4DfGbzK/pby+3kZ8K0k\nn+wO3/737uafk7Tb86mqGxkFxoPd69qq2jJwvc9iQCyvBW8VApwJfLSqjgZOBS7v/rGeADzD6PDF\nscA7k7xkyGJ76DOffw+8NsmtwGuBbwBP99x23JYyn9EbJEcBlwNnVdX3hyq0p6XM523ANVX1ANNj\nKfPZG3hNt/5VjA7r/MZglfaz2/NJ8qPAyxndTWIVcHKSfzFksS1T9z2IPdyCtwoBzqY71FJVNybZ\nn9F9WH4F+Iuq+h7wcJK/BtYzOnY/KX1ufbId+HmAJAcBv1BVj3Z7SCfttO3nhiy2h92eT7f8QuDP\ngHdX1U1jqXh+S/n9/FPgNUnexuh4/b5JHq+qST6DZal/b7dW1b3duk8DJwIXj6PwXVjKfDYCN1XV\n4926P2c0n8+Po/BZ7kEsrz63CrkfOAUgycsZHWfc0fWfnJHnM/pj+MrYKm9bcD5JDu/2gADeBVzS\nta8FXp/k0CSHAq/v+iZpt+fTjf8UcFlVXTXGmuez2/Opql+tqtVVtYbR/8VeNuFwgKX9vX0RODTJ\n7Hmhk4G7x1DzfJYyn/sZ7VnsnWQfRnsXYz/ENNGrFp6LL0aHjf6G0fHp/9j1/Wfg57r2OkZXJ3wZ\nuA14fdd/EKMrGu5i9If9O5OeS8/5/CJwTzfmD4D95mz7r4Ct3eusSc9lKfMBfg34Xvc7m30dt6fO\nZ6f3+A2m4CqmZfh7ex1wO3AH8FFg3z11PoyuXPowo1C4G3j/JOr3m9SSpCYPMUmSmgwISVKTASFJ\najIgJElNBoQkqcmAkCQ1GRCSpCYDQpLU9P8BD4jLG/9xCxsAAAAASUVORK5CYII=\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "df_trn['收率'].plot(kind='hist')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "df_trn = df_trn.query('收率 > 0.8671').reset_index(drop=True)\n",
    "df_trn = df_trn.query('收率 < 0.9861').reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "### 训练模型"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def xgb_cv(train, test, params, fit_params, feature_names, nfold, seed):\n",
    "    # 创建结果df\n",
    "    train_pred = pd.DataFrame({\n",
    "        'id': train['样本id'],\n",
    "        'true': train['收率'],\n",
    "        'pred': np.zeros(len(train))})\n",
    "    # 测试提交结果\n",
    "    test_pred = pd.DataFrame({'id': test['样本id'], 'pred': np.zeros(len(test))})\n",
    "    # 交叉验证\n",
    "    kfolder = KFold(n_splits=nfold, shuffle=True, random_state=seed)\n",
    "    # 构造测试DMatrix\n",
    "    xgb_tst = xgb.DMatrix(data=test[feature_names])\n",
    "    print('\\n')\n",
    "    # 遍历cv中每一折数据，通过索引来指定\n",
    "    for fold_id, (trn_idx, val_idx) in enumerate(kfolder.split(train['收率'])):\n",
    "        # 构造当前训练的DMatrix\n",
    "        xgb_trn = xgb.DMatrix(\n",
    "            train.iloc[trn_idx][feature_names],\n",
    "            train.iloc[trn_idx]['收率'])\n",
    "        # 构造当前验证的DMatrix\n",
    "        xgb_val = xgb.DMatrix(\n",
    "            train.iloc[val_idx][feature_names],\n",
    "            train.iloc[val_idx]['收率'])\n",
    "        # 训练回归模型\n",
    "        xgb_reg = xgb.train(params=params, dtrain=xgb_trn, **fit_params,\n",
    "                  evals=[(xgb_trn, 'train'), (xgb_val, 'valid')])\n",
    "        # 得到验证结果\n",
    "        val_pred = xgb_reg.predict(\n",
    "            xgb.DMatrix(train.iloc[val_idx][feature_names]),\n",
    "            ntree_limit=xgb_reg.best_ntree_limit)\n",
    "        train_pred.loc[val_idx, 'pred'] = val_pred\n",
    "        # print(f'Fold_{fold_id}', mse(train.iloc[val_idx]['收率'], val_pred))\n",
    "        test_pred['pred'] += xgb_reg.predict(\n",
    "            xgb_tst, ntree_limit=xgb_reg.best_ntree_limit) / nfold\n",
    "    print('\\nCV LOSS:', mse(train_pred['true'], train_pred['pred']), '\\n')\n",
    "    return test_pred\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "设置训练参数"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "fit_params = {'num_boost_round': 10800,\n",
    "              'verbose_eval': 300,\n",
    "              'early_stopping_rounds': 360}\n",
    "params_xgb = {'eta': 0.01, 'max_depth': 7, 'subsample': 0.8,\n",
    "              'booster': 'gbtree', 'colsample_bytree': 0.8,\n",
    "              'objective': 'reg:linear', 'silent': True, 'nthread': 4}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "\n",
      "[0]\ttrain-rmse:0.420516\tvalid-rmse:0.417949\n",
      "Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.\n",
      "\n",
      "Will train until valid-rmse hasn't improved in 360 rounds.\n",
      "[300]\ttrain-rmse:0.023793\tvalid-rmse:0.023722\n",
      "[600]\ttrain-rmse:0.006579\tvalid-rmse:0.011484\n",
      "[900]\ttrain-rmse:0.004818\tvalid-rmse:0.011741\n",
      "Stopping. Best iteration:\n",
      "[591]\ttrain-rmse:0.006675\tvalid-rmse:0.01148\n",
      "\n",
      "[0]\ttrain-rmse:0.419813\tvalid-rmse:0.420787\n",
      "Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.\n",
      "\n",
      "Will train until valid-rmse hasn't improved in 360 rounds.\n",
      "[300]\ttrain-rmse:0.023759\tvalid-rmse:0.02565\n",
      "[600]\ttrain-rmse:0.006628\tvalid-rmse:0.012122\n",
      "[900]\ttrain-rmse:0.004786\tvalid-rmse:0.012115\n",
      "Stopping. Best iteration:\n",
      "[739]\ttrain-rmse:0.00563\tvalid-rmse:0.01206\n",
      "\n",
      "[0]\ttrain-rmse:0.419961\tvalid-rmse:0.420189\n",
      "Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.\n",
      "\n",
      "Will train until valid-rmse hasn't improved in 360 rounds.\n",
      "[300]\ttrain-rmse:0.023638\tvalid-rmse:0.025156\n",
      "[600]\ttrain-rmse:0.006163\tvalid-rmse:0.012432\n",
      "[900]\ttrain-rmse:0.004408\tvalid-rmse:0.012191\n",
      "[1200]\ttrain-rmse:0.003417\tvalid-rmse:0.012182\n",
      "Stopping. Best iteration:\n",
      "[1069]\ttrain-rmse:0.003807\tvalid-rmse:0.012174\n",
      "\n",
      "[0]\ttrain-rmse:0.420253\tvalid-rmse:0.419024\n",
      "Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.\n",
      "\n",
      "Will train until valid-rmse hasn't improved in 360 rounds.\n",
      "[300]\ttrain-rmse:0.023808\tvalid-rmse:0.024696\n",
      "[600]\ttrain-rmse:0.006483\tvalid-rmse:0.010892\n",
      "[900]\ttrain-rmse:0.004668\tvalid-rmse:0.010741\n",
      "[1200]\ttrain-rmse:0.003535\tvalid-rmse:0.010781\n",
      "Stopping. Best iteration:\n",
      "[913]\ttrain-rmse:0.004603\tvalid-rmse:0.010731\n",
      "\n",
      "[0]\ttrain-rmse:0.419486\tvalid-rmse:0.42207\n",
      "Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.\n",
      "\n",
      "Will train until valid-rmse hasn't improved in 360 rounds.\n",
      "[300]\ttrain-rmse:0.02389\tvalid-rmse:0.024229\n",
      "[600]\ttrain-rmse:0.006856\tvalid-rmse:0.010381\n",
      "[900]\ttrain-rmse:0.00496\tvalid-rmse:0.010253\n",
      "[1200]\ttrain-rmse:0.003777\tvalid-rmse:0.010266\n",
      "Stopping. Best iteration:\n",
      "[1022]\ttrain-rmse:0.004413\tvalid-rmse:0.010229\n",
      "\n",
      "\n",
      "CV LOSS: 0.000129050223541496 \n",
      "\n"
     ]
    }
   ],
   "source": [
    "# 开始训练\n",
    "pred_xgb_a = xgb_cv(df_trn, df_tst, params_xgb, fit_params,df_trn.columns.tolist()[1:-1], 5, 0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# 得到预测结果\n",
    "df_tst_a['收率'] = pred_xgb_a['pred'].values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>样本id</th>\n",
       "      <th>A1</th>\n",
       "      <th>A2</th>\n",
       "      <th>A3</th>\n",
       "      <th>A4</th>\n",
       "      <th>A5</th>\n",
       "      <th>A6</th>\n",
       "      <th>A7</th>\n",
       "      <th>A8</th>\n",
       "      <th>A9</th>\n",
       "      <th>...</th>\n",
       "      <th>B6</th>\n",
       "      <th>B7</th>\n",
       "      <th>B8</th>\n",
       "      <th>B9</th>\n",
       "      <th>B10</th>\n",
       "      <th>B11</th>\n",
       "      <th>B12</th>\n",
       "      <th>B13</th>\n",
       "      <th>B14</th>\n",
       "      <th>收率</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>sample_1656</td>\n",
       "      <td>300</td>\n",
       "      <td>NaN</td>\n",
       "      <td>405.0</td>\n",
       "      <td>700</td>\n",
       "      <td>6:00:00</td>\n",
       "      <td>29</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>8:00:00</td>\n",
       "      <td>...</td>\n",
       "      <td>79</td>\n",
       "      <td>17:00:00</td>\n",
       "      <td>45</td>\n",
       "      <td>17:00-18:30</td>\n",
       "      <td>18:30-20:00</td>\n",
       "      <td>20:00-21:00</td>\n",
       "      <td>1200</td>\n",
       "      <td>0.15</td>\n",
       "      <td>400</td>\n",
       "      <td>0.905592</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>sample_1548</td>\n",
       "      <td>300</td>\n",
       "      <td>NaN</td>\n",
       "      <td>405.0</td>\n",
       "      <td>700</td>\n",
       "      <td>12:30:00</td>\n",
       "      <td>39</td>\n",
       "      <td>12:50:00</td>\n",
       "      <td>80.0</td>\n",
       "      <td>14:20:00</td>\n",
       "      <td>...</td>\n",
       "      <td>65</td>\n",
       "      <td>10:00:00</td>\n",
       "      <td>45</td>\n",
       "      <td>12:00-13:00</td>\n",
       "      <td>14:00-15:30</td>\n",
       "      <td>NaN</td>\n",
       "      <td>800</td>\n",
       "      <td>0.15</td>\n",
       "      <td>385</td>\n",
       "      <td>0.879489</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>sample_769</td>\n",
       "      <td>300</td>\n",
       "      <td>NaN</td>\n",
       "      <td>405.0</td>\n",
       "      <td>700</td>\n",
       "      <td>6:00:00</td>\n",
       "      <td>80</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>8:00:00</td>\n",
       "      <td>...</td>\n",
       "      <td>80</td>\n",
       "      <td>17:00:00</td>\n",
       "      <td>45</td>\n",
       "      <td>17:00-20:00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1200</td>\n",
       "      <td>0.15</td>\n",
       "      <td>440</td>\n",
       "      <td>0.934109</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>sample_1881</td>\n",
       "      <td>300</td>\n",
       "      <td>NaN</td>\n",
       "      <td>405.0</td>\n",
       "      <td>700</td>\n",
       "      <td>22:00:00</td>\n",
       "      <td>29</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0:00:00</td>\n",
       "      <td>...</td>\n",
       "      <td>80</td>\n",
       "      <td>9:00:00</td>\n",
       "      <td>45</td>\n",
       "      <td>9:00-10:30</td>\n",
       "      <td>10:30-12:00</td>\n",
       "      <td>12:00-13:00</td>\n",
       "      <td>1200</td>\n",
       "      <td>0.15</td>\n",
       "      <td>400</td>\n",
       "      <td>0.903917</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>sample_1807</td>\n",
       "      <td>300</td>\n",
       "      <td>NaN</td>\n",
       "      <td>405.0</td>\n",
       "      <td>700</td>\n",
       "      <td>22:00:00</td>\n",
       "      <td>30</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0:00:00</td>\n",
       "      <td>...</td>\n",
       "      <td>79</td>\n",
       "      <td>9:00:00</td>\n",
       "      <td>45</td>\n",
       "      <td>9:00-10:30</td>\n",
       "      <td>10:30-12:00</td>\n",
       "      <td>12:00-13:00</td>\n",
       "      <td>1200</td>\n",
       "      <td>0.15</td>\n",
       "      <td>400</td>\n",
       "      <td>0.928389</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 44 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "          样本id   A1  A2     A3   A4        A5  A6        A7    A8        A9  \\\n",
       "0  sample_1656  300 NaN  405.0  700   6:00:00  29       NaN   NaN   8:00:00   \n",
       "1  sample_1548  300 NaN  405.0  700  12:30:00  39  12:50:00  80.0  14:20:00   \n",
       "2   sample_769  300 NaN  405.0  700   6:00:00  80       NaN   NaN   8:00:00   \n",
       "3  sample_1881  300 NaN  405.0  700  22:00:00  29       NaN   NaN   0:00:00   \n",
       "4  sample_1807  300 NaN  405.0  700  22:00:00  30       NaN   NaN   0:00:00   \n",
       "\n",
       "     ...     B6        B7  B8           B9          B10          B11   B12  \\\n",
       "0    ...     79  17:00:00  45  17:00-18:30  18:30-20:00  20:00-21:00  1200   \n",
       "1    ...     65  10:00:00  45  12:00-13:00  14:00-15:30          NaN   800   \n",
       "2    ...     80  17:00:00  45  17:00-20:00          NaN          NaN  1200   \n",
       "3    ...     80   9:00:00  45   9:00-10:30  10:30-12:00  12:00-13:00  1200   \n",
       "4    ...     79   9:00:00  45   9:00-10:30  10:30-12:00  12:00-13:00  1200   \n",
       "\n",
       "    B13  B14        收率  \n",
       "0  0.15  400  0.905592  \n",
       "1  0.15  385  0.879489  \n",
       "2  0.15  440  0.934109  \n",
       "3  0.15  400  0.903917  \n",
       "4  0.15  400  0.928389  \n",
       "\n",
       "[5 rows x 44 columns]"
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_tst_a.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  },
  "varInspector": {
   "cols": {
    "lenName": 16,
    "lenType": 16,
    "lenVar": 40
   },
   "kernels_config": {
    "python": {
     "delete_cmd_postfix": "",
     "delete_cmd_prefix": "del ",
     "library": "var_list.py",
     "varRefreshCmd": "print(var_dic_list())"
    },
    "r": {
     "delete_cmd_postfix": ") ",
     "delete_cmd_prefix": "rm(",
     "library": "var_list.r",
     "varRefreshCmd": "cat(var_dic_list()) "
    }
   },
   "types_to_exclude": [
    "module",
    "function",
    "builtin_function_or_method",
    "instance",
    "_Feature"
   ],
   "window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
